summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virt/kvm/api.rst41
-rw-r--r--Documentation/virt/kvm/devices/vm.rst3
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/arm64/kvm/arm.c5
-rw-r--r--arch/arm64/kvm/irq.h16
-rw-r--r--arch/arm64/kvm/mmu.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c2
-rw-r--r--arch/powerpc/kvm/irq.h22
-rw-r--r--arch/powerpc/kvm/powerpc.c18
-rw-r--r--arch/s390/include/asm/kvm_host.h14
-rw-r--r--arch/s390/include/asm/mem_encrypt.h4
-rw-r--r--arch/s390/include/asm/stacktrace.h1
-rw-r--r--arch/s390/include/asm/uv.h10
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kernel/entry.S26
-rw-r--r--arch/s390/kernel/uv.c7
-rw-r--r--arch/s390/kvm/intercept.c9
-rw-r--r--arch/s390/kvm/interrupt.c5
-rw-r--r--arch/s390/kvm/irq.h19
-rw-r--r--arch/s390/kvm/kvm-s390.c142
-rw-r--r--arch/s390/kvm/kvm-s390.h9
-rw-r--r--arch/s390/kvm/pci.c2
-rw-r--r--arch/s390/kvm/priv.c3
-rw-r--r--arch/s390/kvm/pv.c357
-rw-r--r--arch/s390/kvm/vsie.c4
-rw-r--r--arch/s390/mm/gmap.c147
-rw-r--r--arch/s390/mm/init.c12
-rw-r--r--arch/x86/events/intel/lbr.c6
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h37
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h148
-rw-r--r--arch/x86/include/asm/perf_event.h6
-rw-r--r--arch/x86/include/asm/spec-ctrl.h10
-rw-r--r--arch/x86/include/asm/svm.h100
-rw-r--r--arch/x86/include/uapi/asm/kvm.h5
-rw-r--r--arch/x86/kernel/asm-offsets.c6
-rw-r--r--arch/x86/kernel/cpu/bugs.c15
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kvm/.gitignore2
-rw-r--r--arch/x86/kvm/Kconfig11
-rw-r--r--arch/x86/kvm/Makefile18
-rw-r--r--arch/x86/kvm/cpuid.c29
-rw-r--r--arch/x86/kvm/emulate.c355
-rw-r--r--arch/x86/kvm/hyperv.c353
-rw-r--r--arch/x86/kvm/hyperv.h64
-rw-r--r--arch/x86/kvm/irq.c5
-rw-r--r--arch/x86/kvm/kvm-asm-offsets.c29
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/kvm_emulate.h48
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c176
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h33
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h6
-rw-r--r--arch/x86/kvm/mmu/spte.c12
-rw-r--r--arch/x86/kvm/mmu/spte.h19
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c114
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h2
-rw-r--r--arch/x86/kvm/pmu.c92
-rw-r--r--arch/x86/kvm/pmu.h6
-rw-r--r--arch/x86/kvm/reverse_cpuid.h25
-rw-r--r--arch/x86/kvm/smm.c649
-rw-r--r--arch/x86/kvm/smm.h168
-rw-r--r--arch/x86/kvm/svm/hyperv.c18
-rw-r--r--arch/x86/kvm/svm/hyperv.h50
-rw-r--r--arch/x86/kvm/svm/nested.c64
-rw-r--r--arch/x86/kvm/svm/pmu.c11
-rw-r--r--arch/x86/kvm/svm/sev.c8
-rw-r--r--arch/x86/kvm/svm/svm.c174
-rw-r--r--arch/x86/kvm/svm/svm.h16
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c8
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h25
-rw-r--r--arch/x86/kvm/svm/svm_ops.h5
-rw-r--r--arch/x86/kvm/svm/vmenter.S260
-rw-r--r--arch/x86/kvm/trace.h36
-rw-r--r--arch/x86/kvm/vmx/capabilities.h24
-rw-r--r--arch/x86/kvm/vmx/hyperv.c (renamed from arch/x86/kvm/vmx/evmcs.c)45
-rw-r--r--arch/x86/kvm/vmx/hyperv.h (renamed from arch/x86/kvm/vmx/evmcs.h)12
-rw-r--r--arch/x86/kvm/vmx/nested.c48
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c15
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h5
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c53
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h2
-rw-r--r--arch/x86/kvm/x86.c493
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/kvm/xen.c34
-rw-r--r--drivers/s390/crypto/vfio_ap_ops.c2
-rw-r--r--include/asm-generic/hyperv-tlfs.h5
-rw-r--r--include/asm-generic/mshyperv.h11
-rw-r--r--include/linux/build_bug.h9
-rw-r--r--include/linux/kvm_host.h21
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/uapi/linux/kvm.h8
-rw-r--r--mm/gup.c33
-rw-r--r--mm/hugetlb.c5
-rw-r--r--tools/arch/x86/include/asm/atomic.h7
-rw-r--r--tools/include/asm-generic/atomic-gcc.h12
-rwxr-xr-xtools/kvm/kvm_stat/kvm_stat98
-rw-r--r--tools/testing/selftests/kvm/.gitignore8
-rw-r--r--tools/testing/selftests/kvm/Makefile14
-rw-r--r--tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c3
-rw-r--r--tools/testing/selftests/kvm/aarch64/arch_timer.c29
-rw-r--r--tools/testing/selftests/kvm/aarch64/debug-exceptions.c32
-rw-r--r--tools/testing/selftests/kvm/aarch64/hypercalls.c3
-rw-r--r--tools/testing/selftests/kvm/aarch64/page_fault_test.c11
-rw-r--r--tools/testing/selftests/kvm/aarch64/psci_test.c1
-rw-r--r--tools/testing/selftests/kvm/aarch64/vgic_init.c2
-rw-r--r--tools/testing/selftests/kvm/aarch64/vgic_irq.c10
-rw-r--r--tools/testing/selftests/kvm/access_tracking_perf_test.c24
-rw-r--r--tools/testing/selftests/kvm/demand_paging_test.c26
-rw-r--r--tools/testing/selftests/kvm/dirty_log_perf_test.c130
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c3
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h29
-rw-r--r--tools/testing/selftests/kvm/include/memstress.h75
-rw-r--r--tools/testing/selftests/kvm/include/perf_test_util.h66
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h25
-rw-r--r--tools/testing/selftests/kvm/include/ucall_common.h10
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/evmcs.h48
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/hyperv.h103
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h452
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm.h26
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm_util.h14
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/vmx.h25
-rw-r--r--tools/testing/selftests/kvm/kvm_page_table_test.c6
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/processor.c18
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/ucall.c102
-rw-r--r--tools/testing/selftests/kvm/lib/elf.c2
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c94
-rw-r--r--tools/testing/selftests/kvm/lib/memstress.c (renamed from tools/testing/selftests/kvm/lib/perf_test_util.c)137
-rw-r--r--tools/testing/selftests/kvm/lib/riscv/ucall.c42
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/ucall.c39
-rw-r--r--tools/testing/selftests/kvm/lib/test_util.c36
-rw-r--r--tools/testing/selftests/kvm/lib/ucall_common.c103
-rw-r--r--tools/testing/selftests/kvm/lib/userfaultfd_util.c2
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/hyperv.c46
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/memstress.c (renamed from tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c)37
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c236
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/ucall.c39
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/vmx.c56
-rw-r--r--tools/testing/selftests/kvm/max_guest_memory_test.c21
-rw-r--r--tools/testing/selftests/kvm/memslot_modification_stress_test.c40
-rw-r--r--tools/testing/selftests/kvm/memslot_perf_test.c30
-rw-r--r--tools/testing/selftests/kvm/rseq_test.c4
-rw-r--r--tools/testing/selftests/kvm/s390x/memop.c2
-rw-r--r--tools/testing/selftests/kvm/s390x/resets.c2
-rw-r--r--tools/testing/selftests/kvm/s390x/sync_regs_test.c3
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c5
-rw-r--r--tools/testing/selftests/kvm/steal_time.c1
-rw-r--r--tools/testing/selftests/kvm/system_counter_offset_test.c1
-rw-r--r--tools/testing/selftests/kvm/x86_64/amx_test.c101
-rw-r--r--tools/testing/selftests/kvm/x86_64/cpuid_test.c11
-rw-r--r--tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/emulator_error_test.c193
-rw-r--r--tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c45
-rw-r--r--tools/testing/selftests/kvm/x86_64/flds_emulation.h55
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c (renamed from tools/testing/selftests/kvm/x86_64/evmcs_test.c)96
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_features.c25
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_ipi.c314
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c99
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c690
-rw-r--r--tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c5
-rw-r--r--tools/testing/selftests/kvm/x86_64/platform_info_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c77
-rw-r--r--tools/testing/selftests/kvm/x86_64/set_sregs_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c111
-rw-r--r--tools/testing/selftests/kvm/x86_64/smm_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c67
-rw-r--r--tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/sync_regs_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c73
-rw-r--r--tools/testing/selftests/kvm/x86_64/userspace_io_test.c3
-rw-r--r--tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c88
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c1
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c19
-rw-r--r--tools/testing/selftests/kvm/x86_64/xapic_state_test.c4
-rw-r--r--virt/kvm/irqchip.c3
-rw-r--r--virt/kvm/kvm_main.c38
-rw-r--r--virt/kvm/kvm_mm.h4
-rw-r--r--virt/kvm/pfncache.c9
183 files changed, 6423 insertions, 3088 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 226b40baffb8..fbb9a22030ba 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5163,10 +5163,13 @@ KVM_PV_ENABLE
===== =============================
KVM_PV_DISABLE
- Deregister the VM from the Ultravisor and reclaim the memory that
- had been donated to the Ultravisor, making it usable by the kernel
- again. All registered VCPUs are converted back to non-protected
- ones.
+ Deregister the VM from the Ultravisor and reclaim the memory that had
+ been donated to the Ultravisor, making it usable by the kernel again.
+ All registered VCPUs are converted back to non-protected ones. If a
+ previous protected VM had been prepared for asynchonous teardown with
+ KVM_PV_ASYNC_CLEANUP_PREPARE and not subsequently torn down with
+ KVM_PV_ASYNC_CLEANUP_PERFORM, it will be torn down in this call
+ together with the current protected VM.
KVM_PV_VM_SET_SEC_PARMS
Pass the image header from VM memory to the Ultravisor in
@@ -5289,6 +5292,36 @@ KVM_PV_DUMP
authentication tag all of which are needed to decrypt the dump at a
later time.
+KVM_PV_ASYNC_CLEANUP_PREPARE
+ :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
+
+ Prepare the current protected VM for asynchronous teardown. Most
+ resources used by the current protected VM will be set aside for a
+ subsequent asynchronous teardown. The current protected VM will then
+ resume execution immediately as non-protected. There can be at most
+ one protected VM prepared for asynchronous teardown at any time. If
+ a protected VM had already been prepared for teardown without
+ subsequently calling KVM_PV_ASYNC_CLEANUP_PERFORM, this call will
+ fail. In that case, the userspace process should issue a normal
+ KVM_PV_DISABLE. The resources set aside with this call will need to
+ be cleaned up with a subsequent call to KVM_PV_ASYNC_CLEANUP_PERFORM
+ or KVM_PV_DISABLE, otherwise they will be cleaned up when KVM
+ terminates. KVM_PV_ASYNC_CLEANUP_PREPARE can be called again as soon
+ as cleanup starts, i.e. before KVM_PV_ASYNC_CLEANUP_PERFORM finishes.
+
+KVM_PV_ASYNC_CLEANUP_PERFORM
+ :Capability: KVM_CAP_S390_PROTECTED_ASYNC_DISABLE
+
+ Tear down the protected VM previously prepared for teardown with
+ KVM_PV_ASYNC_CLEANUP_PREPARE. The resources that had been set aside
+ will be freed during the execution of this command. This PV command
+ should ideally be issued by userspace from a separate thread. If a
+ fatal signal is received (or the process terminates naturally), the
+ command will terminate immediately without completing, and the normal
+ KVM shutdown procedure will take care of cleaning up all remaining
+ protected VMs, including the ones whose teardown was interrupted by
+ process termination.
+
4.126 KVM_XEN_HVM_SET_ATTR
--------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 0aa5b1cfd700..60acc39e0e93 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -215,6 +215,7 @@ KVM_S390_VM_TOD_EXT).
:Parameters: address of a buffer in user space to store the data (u8) to
:Returns: -EFAULT if the given address is not accessible from kernel space;
-EINVAL if setting the TOD clock extension to != 0 is not supported
+ -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor)
3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW
-----------------------------------
@@ -224,6 +225,7 @@ the POP (u64).
:Parameters: address of a buffer in user space to store the data (u64) to
:Returns: -EFAULT if the given address is not accessible from kernel space
+ -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor)
3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT
-----------------------------------
@@ -237,6 +239,7 @@ it, it is stored as 0 and not allowed to be set to a value != 0.
(kvm_s390_vm_tod_clock) to
:Returns: -EFAULT if the given address is not accessible from kernel space;
-EINVAL if setting the TOD clock extension to != 0 is not supported
+ -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor)
4. GROUP: KVM_S390_VM_CRYPTO
============================
diff --git a/MAINTAINERS b/MAINTAINERS
index 046ff06ff97f..89672a59c0c3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11324,6 +11324,16 @@ F: arch/x86/kvm/svm/hyperv.*
F: arch/x86/kvm/svm/svm_onhyperv.*
F: arch/x86/kvm/vmx/evmcs.*
+KVM X86 Xen (KVM/Xen)
+M: David Woodhouse <dwmw2@infradead.org>
+M: Paul Durrant <paul@xen.org>
+M: Sean Christopherson <seanjc@google.com>
+M: Paolo Bonzini <pbonzini@redhat.com>
+L: kvm@vger.kernel.org
+S: Supported
+T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
+F: arch/x86/kvm/xen.*
+
KERNFS
M: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
M: Tejun Heo <tj@kernel.org>
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 00da570ed72b..9c5573bc4614 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2142,6 +2142,11 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
return NULL;
}
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
bool kvm_arch_has_irq_bypass(void)
{
return true;
diff --git a/arch/arm64/kvm/irq.h b/arch/arm64/kvm/irq.h
deleted file mode 100644
index 0d257de42c10..000000000000
--- a/arch/arm64/kvm/irq.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * irq.h: in kernel interrupt controller related definitions
- * Copyright (c) 2016 Red Hat, Inc.
- *
- * This header is included by irqchip.c. However, on ARM, interrupt
- * controller declarations are located in include/kvm/arm_vgic.h since
- * they are mostly shared between arm and arm64.
- */
-
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include <kvm/arm_vgic.h>
-
-#endif
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 39d9a334efb5..31d7fa4c7c14 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1300,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
*/
smp_rmb();
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
write_fault, &writable, NULL);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index e9744b41a226..4939f57b6f6a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -598,7 +598,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
write_ok = true;
} else {
/* Call KVM generic code to do the slow-path check */
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
writing, &write_ok, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 5d5e12f3bf86..9d3743ca16d5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -846,7 +846,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long pfn;
/* Call KVM generic code to do the slow-path check */
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
writing, upgrade_p, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
deleted file mode 100644
index e6463f866abc..000000000000
--- a/arch/powerpc/kvm/irq.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IRQ_H
-#define __IRQ_H
-
-#include <linux/kvm_host.h>
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- int ret = 0;
-
-#ifdef CONFIG_KVM_MPIC
- ret = ret || (kvm->arch.mpic != NULL);
-#endif
-#ifdef CONFIG_KVM_XICS
- ret = ret || (kvm->arch.xics != NULL);
- ret = ret || (kvm->arch.xive != NULL);
-#endif
- smp_rmb();
- return ret;
-}
-
-#endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b850b0efa201..04494a4fb37a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,7 +36,6 @@
#include <asm/setup.h>
#include "timing.h"
-#include "irq.h"
#include "../mm/mmu_decl.h"
#define CREATE_TRACE_POINTS
@@ -2165,10 +2164,25 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
return 0;
}
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+ ret = ret || (kvm->arch.mpic != NULL);
+#endif
+#ifdef CONFIG_KVM_XICS
+ ret = ret || (kvm->arch.xics != NULL);
+ ret = ret || (kvm->arch.xive != NULL);
+#endif
+ smp_rmb();
+ return ret;
+}
+
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
bool line_status)
{
- if (!irqchip_in_kernel(kvm))
+ if (!kvm_arch_irqchip_in_kernel(kvm))
return -ENXIO;
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b1e98a9ed152..d67ce719d16a 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -142,8 +142,7 @@ struct mcck_volatile_info {
CR14_EXTERNAL_DAMAGE_SUBMASK)
#define SIDAD_SIZE_MASK 0xff
-#define sida_origin(sie_block) \
- ((sie_block)->sidad & PAGE_MASK)
+#define sida_addr(sie_block) phys_to_virt((sie_block)->sidad & PAGE_MASK)
#define sida_size(sie_block) \
((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE)
@@ -276,6 +275,7 @@ struct kvm_s390_sie_block {
#define ECB3_AES 0x04
#define ECB3_RI 0x01
__u8 ecb3; /* 0x0063 */
+#define ESCA_SCAOL_MASK ~0x3fU
__u32 scaol; /* 0x0064 */
__u8 sdf; /* 0x0068 */
__u8 epdx; /* 0x0069 */
@@ -942,6 +942,8 @@ struct kvm_s390_pv {
unsigned long stor_base;
void *stor_var;
bool dumping;
+ void *set_aside;
+ struct list_head need_cleanup;
struct mmu_notifier mmu_notifier;
};
@@ -1017,7 +1019,13 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm);
void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
unsigned long *aqm, unsigned long *adm);
-extern int sie64a(struct kvm_s390_sie_block *, u64 *);
+int __sie64a(phys_addr_t sie_block_phys, struct kvm_s390_sie_block *sie_block, u64 *rsa);
+
+static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa)
+{
+ return __sie64a(virt_to_phys(sie_block), sie_block, rsa);
+}
+
extern char sie_exit;
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h
index 08a8b96606d7..b85e13505a0f 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,8 +4,8 @@
#ifndef __ASSEMBLY__
-int set_memory_encrypted(unsigned long addr, int numpages);
-int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long vaddr, int numpages);
+int set_memory_decrypted(unsigned long vaddr, int numpages);
#endif /* __ASSEMBLY__ */
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index b23c658dce77..1802be5abb5d 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -46,6 +46,7 @@ struct stack_frame {
unsigned long sie_savearea;
unsigned long sie_reason;
unsigned long sie_flags;
+ unsigned long sie_control_block_phys;
};
};
unsigned long gprs[10];
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index be3ef9dd6972..28a9ad57b6f1 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -34,6 +34,7 @@
#define UVC_CMD_INIT_UV 0x000f
#define UVC_CMD_CREATE_SEC_CONF 0x0100
#define UVC_CMD_DESTROY_SEC_CONF 0x0101
+#define UVC_CMD_DESTROY_SEC_CONF_FAST 0x0102
#define UVC_CMD_CREATE_SEC_CPU 0x0120
#define UVC_CMD_DESTROY_SEC_CPU 0x0121
#define UVC_CMD_CONV_TO_SEC_STOR 0x0200
@@ -81,6 +82,7 @@ enum uv_cmds_inst {
BIT_UVC_CMD_UNSHARE_ALL = 20,
BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
+ BIT_UVC_CMD_DESTROY_SEC_CONF_FAST = 23,
BIT_UVC_CMD_DUMP_INIT = 24,
BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
BIT_UVC_CMD_DUMP_CPU = 26,
@@ -230,6 +232,14 @@ struct uv_cb_nodata {
u64 reserved20[4];
} __packed __aligned(8);
+/* Destroy Configuration Fast */
+struct uv_cb_destroy_fast {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u64 handle;
+ u64 reserved20[5];
+} __packed __aligned(8);
+
/* Set Shared Access */
struct uv_cb_share {
struct uv_cb_header header;
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index d8ce965c0a97..3f8e760298c2 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -62,6 +62,7 @@ int main(void)
OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea);
OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
+ OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
/* idle data offsets */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index d2a1f2f4f5b8..12e1773a94a4 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -225,18 +225,20 @@ ENDPROC(__switch_to)
#if IS_ENABLED(CONFIG_KVM)
/*
- * sie64a calling convention:
- * %r2 pointer to sie control block
- * %r3 guest register save area
+ * __sie64a calling convention:
+ * %r2 pointer to sie control block phys
+ * %r3 pointer to sie control block virt
+ * %r4 guest register save area
*/
-ENTRY(sie64a)
+ENTRY(__sie64a)
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
lg %r12,__LC_CURRENT
- stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer
- stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area
+ stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
+ stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses
+ stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
- lmg %r0,%r13,0(%r3) # load guest gprs 0-13
+ lmg %r0,%r13,0(%r4) # load guest gprs 0-13
lg %r14,__LC_GMAP # get gmap pointer
ltgr %r14,%r14
jz .Lsie_gmap
@@ -248,6 +250,7 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
+ lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr
BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_entry:
sie 0(%r14)
@@ -258,13 +261,14 @@ ENTRY(sie64a)
BPOFF
BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
.Lsie_skip:
+ lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
.Lsie_done:
# some program checks are suppressing. C code (e.g. do_protection_exception)
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
-# Other instructions between sie64a and .Lsie_done should not cause program
+# Other instructions between __sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
.Lrewind_pad6:
nopr 7
@@ -293,8 +297,8 @@ sie_exit:
EX_TABLE(.Lrewind_pad4,.Lsie_fault)
EX_TABLE(.Lrewind_pad2,.Lsie_fault)
EX_TABLE(sie_exit,.Lsie_fault)
-ENDPROC(sie64a)
-EXPORT_SYMBOL(sie64a)
+ENDPROC(__sie64a)
+EXPORT_SYMBOL(__sie64a)
EXPORT_SYMBOL(sie_exit)
#endif
@@ -373,7 +377,7 @@ ENTRY(pgm_check_handler)
j 3f # -> fault in user space
.Lpgm_skip_asce:
#if IS_ENABLED(CONFIG_KVM)
- # cleanup critical section for program checks in sie64a
+ # cleanup critical section for program checks in __sie64a
OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
SIEEXIT
lghi %r10,_PIF_GUEST_FAULT
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index f9810d2a267c..9f18a4af9c13 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -255,6 +255,13 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr,
*/
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
{
+ /*
+ * The misc feature indicates, among other things, that importing a
+ * shared page from a different protected VM will automatically also
+ * transfer its ownership.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications))
+ return false;
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
return false;
return atomic_read(&mm->context.protected_count) > 1;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 88112065d941..0ee02dae14b2 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -217,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
return 0;
if (current->thread.per_flags & PER_FLAG_NO_TE)
return 0;
- itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba;
+ itdb = phys_to_virt(vcpu->arch.sie_block->itdba);
rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb));
if (rc)
return rc;
@@ -409,8 +409,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
out:
if (!cc) {
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
- memcpy((void *)(sida_origin(vcpu->arch.sie_block)),
- sctns, PAGE_SIZE);
+ memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE);
} else {
r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
if (r) {
@@ -464,7 +463,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
static int handle_pv_spx(struct kvm_vcpu *vcpu)
{
- u32 pref = *(u32 *)vcpu->arch.sie_block->sidad;
+ u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block);
kvm_s390_set_prefix(vcpu, pref);
trace_kvm_s390_handle_prefix(vcpu, 1, pref);
@@ -497,7 +496,7 @@ static int handle_pv_sclp(struct kvm_vcpu *vcpu)
static int handle_pv_uvc(struct kvm_vcpu *vcpu)
{
- struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad;
+ struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block);
struct uv_cb_cts uvcb = {
.header.cmd = UVC_CMD_UNPIN_PAGE_SHARED,
.header.len = sizeof(uvcb),
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ab569faf0df2..1dae78deddf2 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -314,11 +314,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa)
return READ_ONCE(gisa->ipm);
}
-static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
-{
- clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
-}
-
static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
{
return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
deleted file mode 100644
index 484608c71dd0..000000000000
--- a/arch/s390/kvm/irq.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * s390 irqchip routines
- *
- * Copyright IBM Corp. 2014
- *
- * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
- */
-#ifndef __KVM_IRQ_H
-#define __KVM_IRQ_H
-
-#include <linux/kvm_host.h>
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
- return 1;
-}
-
-#endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 45d4b8182b07..e4890e04b210 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -210,6 +210,14 @@ module_param(diag9c_forwarding_hz, uint, 0644);
MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
/*
+ * allow asynchronous deinit for protected guests; enable by default since
+ * the feature is opt-in anyway
+ */
+static int async_destroy = 1;
+module_param(async_destroy, int, 0444);
+MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests");
+
+/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
* this, this requires changes to code, but the external uapi can stay.
@@ -616,6 +624,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_BPB:
r = test_facility(82);
break;
+ case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE:
+ r = async_destroy && is_prot_virt_host();
+ break;
case KVM_CAP_S390_PROTECTED:
r = is_prot_virt_host();
break;
@@ -1207,6 +1218,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm,
return 0;
}
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
+
static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_s390_vm_tod_clock gtod;
@@ -1216,7 +1229,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr)
if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx)
return -EINVAL;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx",
gtod.epoch_idx, gtod.tod);
@@ -1247,7 +1260,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
sizeof(gtod.tod)))
return -EFAULT;
- kvm_s390_set_tod_clock(kvm, &gtod);
+ __kvm_s390_set_tod_clock(kvm, &gtod);
VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod);
return 0;
}
@@ -1259,6 +1272,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
if (attr->flags)
return -EINVAL;
+ mutex_lock(&kvm->lock);
+ /*
+ * For protected guests, the TOD is managed by the ultravisor, so trying
+ * to change it will never bring the expected results.
+ */
+ if (kvm_s390_pv_is_protected(kvm)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
switch (attr->attr) {
case KVM_S390_VM_TOD_EXT:
ret = kvm_s390_set_tod_ext(kvm, attr);
@@ -1273,6 +1296,9 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr)
ret = -ENXIO;
break;
}
+
+out_unlock:
+ mutex_unlock(&kvm->lock);
return ret;
}
@@ -2504,9 +2530,13 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
{
+ const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM);
+ void __user *argp = (void __user *)cmd->data;
int r = 0;
u16 dummy;
- void __user *argp = (void __user *)cmd->data;
+
+ if (need_lock)
+ mutex_lock(&kvm->lock);
switch (cmd->cmd) {
case KVM_PV_ENABLE: {
@@ -2540,6 +2570,31 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
break;
}
+ case KVM_PV_ASYNC_CLEANUP_PREPARE:
+ r = -EINVAL;
+ if (!kvm_s390_pv_is_protected(kvm) || !async_destroy)
+ break;
+
+ r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+ /*
+ * If a CPU could not be destroyed, destroy VM will also fail.
+ * There is no point in trying to destroy it. Instead return
+ * the rc and rrc from the first CPU that failed destroying.
+ */
+ if (r)
+ break;
+ r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc);
+
+ /* no need to block service interrupts any more */
+ clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
+ break;
+ case KVM_PV_ASYNC_CLEANUP_PERFORM:
+ r = -EINVAL;
+ if (!async_destroy)
+ break;
+ /* kvm->lock must not be held; this is asserted inside the function. */
+ r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc);
+ break;
case KVM_PV_DISABLE: {
r = -EINVAL;
if (!kvm_s390_pv_is_protected(kvm))
@@ -2553,7 +2608,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
*/
if (r)
break;
- r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
+ r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc);
/* no need to block service interrupts any more */
clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
@@ -2703,6 +2758,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
default:
r = -ENOTTY;
}
+ if (need_lock)
+ mutex_unlock(&kvm->lock);
+
return r;
}
@@ -2907,9 +2965,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = -EINVAL;
break;
}
- mutex_lock(&kvm->lock);
+ /* must be called without kvm->lock */
r = kvm_s390_handle_pv(kvm, &args);
- mutex_unlock(&kvm->lock);
if (copy_to_user(argp, &args, sizeof(args))) {
r = -EFAULT;
break;
@@ -3228,6 +3285,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_vsie_init(kvm);
if (use_gisa)
kvm_s390_gisa_init(kvm);
+ INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
+ kvm->arch.pv.set_aside = NULL;
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -3272,11 +3331,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
/*
* We are already at the end of life and kvm->lock is not taken.
* This is ok as the file descriptor is closed by now and nobody
- * can mess with the pv state. To avoid lockdep_assert_held from
- * complaining we do not use kvm_s390_pv_is_protected.
+ * can mess with the pv state.
*/
- if (kvm_s390_pv_get_handle(kvm))
- kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
+ kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc);
/*
* Remove the mmu notifier only when the whole KVM VM is torn down,
* and only if one was registered to begin with. If the VM is
@@ -3329,28 +3386,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
if (!kvm_s390_use_sca_entries()) {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
/* we still need the basic sca for the ipte control */
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys;
return;
}
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU;
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
} else {
struct bsca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
- vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
- vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys;
set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
}
read_unlock(&vcpu->kvm->arch.sca_lock);
@@ -3381,6 +3440,7 @@ static int sca_switch_to_extended(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long vcpu_idx;
u32 scaol, scaoh;
+ phys_addr_t new_sca_phys;
if (kvm->arch.use_esca)
return 0;
@@ -3389,8 +3449,9 @@ static int sca_switch_to_extended(struct kvm *kvm)
if (!new_sca)
return -ENOMEM;
- scaoh = (u32)((u64)(new_sca) >> 32);
- scaol = (u32)(u64)(new_sca) & ~0x3fU;
+ new_sca_phys = virt_to_phys(new_sca);
+ scaoh = new_sca_phys >> 32;
+ scaol = new_sca_phys & ESCA_SCAOL_MASK;
kvm_s390_vcpu_block_all(kvm);
write_lock(&kvm->arch.sca_lock);
@@ -3610,15 +3671,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu)
{
- free_page(vcpu->arch.sie_block->cbrlo);
+ free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo));
vcpu->arch.sie_block->cbrlo = 0;
}
int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
{
- vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.sie_block->cbrlo)
+ void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+
+ if (!cbrlo_page)
return -ENOMEM;
+
+ vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page);
return 0;
}
@@ -3628,7 +3692,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->ibc = model->ibc;
if (test_kvm_facility(vcpu->kvm, 7))
- vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
+ vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list);
}
static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -3685,9 +3749,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
}
- vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
- | SDNXC;
- vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
+ vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC;
+ vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb);
if (sclp.has_kss)
kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
@@ -3737,7 +3800,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return -ENOMEM;
vcpu->arch.sie_block = &sie_page->sie_block;
- vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
/* the real guest size will always be smaller than msl */
vcpu->arch.sie_block->mso = 0;
@@ -4377,13 +4440,6 @@ static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_t
preempt_enable();
}
-void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
-{
- mutex_lock(&kvm->lock);
- __kvm_s390_set_tod_clock(kvm, gtod);
- mutex_unlock(&kvm->lock);
-}
-
int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
{
if (!mutex_trylock(&kvm->lock))
@@ -5161,6 +5217,7 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ void *sida_addr;
int r = 0;
if (mop->flags || !mop->size)
@@ -5172,16 +5229,16 @@ static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu,
if (!kvm_s390_pv_cpu_is_protected(vcpu))
return -EINVAL;
+ sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset;
+
switch (mop->op) {
case KVM_S390_MEMOP_SIDA_READ:
- if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) +
- mop->sida_offset), mop->size))
+ if (copy_to_user(uaddr, sida_addr, mop->size))
r = -EFAULT;
break;
case KVM_S390_MEMOP_SIDA_WRITE:
- if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) +
- mop->sida_offset), uaddr, mop->size))
+ if (copy_from_user(sida_addr, uaddr, mop->size))
r = -EFAULT;
break;
}
@@ -5559,6 +5616,11 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return true;
+}
+
/* Section: memory related */
int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *old,
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f6fd668f887e..d48588c207d8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -23,7 +23,8 @@
/* Transactional Memory Execution related macros */
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE))
#define TDB_FORMAT1 1
-#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
+#define IS_ITDB_VALID(vcpu) \
+ ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1))
extern debug_info_t *kvm_s390_dbf;
extern debug_info_t *kvm_s390_dbf_uv;
@@ -233,7 +234,7 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
{
- u32 gd = (u32)(u64)kvm->arch.gisa_int.origin;
+ u32 gd = virt_to_phys(kvm->arch.gisa_int.origin);
if (gd && sclp.has_gisaf)
gd |= GISA_FORMAT1;
@@ -243,6 +244,9 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
/* implemented in pv.c */
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc);
int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
@@ -363,7 +367,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index c50c1645c0ae..ded1af2ddae9 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -126,7 +126,7 @@ int kvm_s390_pci_aen_init(u8 nisc)
return -EPERM;
mutex_lock(&aift->aift_lock);
- aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev),
+ aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *),
GFP_KERNEL);
if (!aift->kzdev) {
rc = -ENOMEM;
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 3335fa09b6f1..9f8a192bd750 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -924,8 +924,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
return -EREMOTE;
}
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
- memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
- PAGE_SIZE);
+ memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE);
rc = 0;
} else {
rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE);
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 7cb7799a0acb..e032ebbf51b9 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -18,6 +18,29 @@
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+/**
+ * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
+ * be destroyed
+ *
+ * @list: list head for the list of leftover VMs
+ * @old_gmap_table: the gmap table of the leftover protected VM
+ * @handle: the handle of the leftover protected VM
+ * @stor_var: pointer to the variable storage of the leftover protected VM
+ * @stor_base: address of the base storage of the leftover protected VM
+ *
+ * Represents a protected VM that is still registered with the Ultravisor,
+ * but which does not correspond any longer to an active KVM VM. It should
+ * be destroyed at some point later, either asynchronously or when the
+ * process terminates.
+ */
+struct pv_vm_to_be_destroyed {
+ struct list_head list;
+ unsigned long old_gmap_table;
+ u64 handle;
+ void *stor_var;
+ unsigned long stor_base;
+};
+
static void kvm_s390_clear_pv_state(struct kvm *kvm)
{
kvm->arch.pv.handle = 0;
@@ -44,7 +67,7 @@ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
free_pages(vcpu->arch.pv.stor_base,
get_order(uv_info.guest_cpu_stor_len));
- free_page(sida_origin(vcpu->arch.sie_block));
+ free_page((unsigned long)sida_addr(vcpu->arch.sie_block));
vcpu->arch.sie_block->pv_handle_cpu = 0;
vcpu->arch.sie_block->pv_handle_config = 0;
memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
@@ -66,6 +89,7 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
.header.cmd = UVC_CMD_CREATE_SEC_CPU,
.header.len = sizeof(uvcb),
};
+ void *sida_addr;
int cc;
if (kvm_s390_pv_cpu_get_handle(vcpu))
@@ -79,16 +103,17 @@ int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
/* Input */
uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
uvcb.num = vcpu->arch.sie_block->icpua;
- uvcb.state_origin = (u64)vcpu->arch.sie_block;
- uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
+ uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block);
+ uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base);
/* Alloc Secure Instruction Data Area Designation */
- vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!vcpu->arch.sie_block->sidad) {
+ sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sida_addr) {
free_pages(vcpu->arch.pv.stor_base,
get_order(uv_info.guest_cpu_stor_len));
return -ENOMEM;
}
+ vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr);
cc = uv_call(0, (u64)&uvcb);
*rc = uvcb.header.rc;
@@ -159,23 +184,192 @@ out_err:
return -ENOMEM;
}
-/* this should not fail, but if it does, we must not free the donated memory */
-int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+/**
+ * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM.
+ * @kvm: the KVM that was associated with this leftover protected VM
+ * @leftover: details about the leftover protected VM that needs a clean up
+ * @rc: the RC code of the Destroy Secure Configuration UVC
+ * @rrc: the RRC code of the Destroy Secure Configuration UVC
+ *
+ * Destroy one leftover protected VM.
+ * On success, kvm->mm->context.protected_count will be decremented atomically
+ * and all other resources used by the VM will be freed.
+ *
+ * Return: 0 in case of success, otherwise 1
+ */
+static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
+ struct pv_vm_to_be_destroyed *leftover,
+ u16 *rc, u16 *rrc)
{
int cc;
- cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
- UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ /* It used the destroy-fast UVC, nothing left to do here */
+ if (!leftover->handle)
+ goto done_fast;
+ cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc);
+ WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc);
+ if (cc)
+ return cc;
+ /*
+ * Intentionally leak unusable memory. If the UVC fails, the memory
+ * used for the VM and its metadata is permanently unusable.
+ * This can only happen in case of a serious KVM or hardware bug; it
+ * is not expected to happen in normal operation.
+ */
+ free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len));
+ free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER);
+ vfree(leftover->stor_var);
+done_fast:
+ atomic_dec(&kvm->mm->context.protected_count);
+ return 0;
+}
+
+/**
+ * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
+ * @kvm: the VM whose memory is to be cleared.
+ *
+ * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
+ * The CPUs of the protected VM need to be destroyed beforehand.
+ */
+static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
+{
+ const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
+ struct kvm_memory_slot *slot;
+ unsigned long len;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ /* Take the memslot containing guest absolute address 0 */
+ slot = gfn_to_memslot(kvm, 0);
+ /* Clear all slots or parts thereof that are below 2GB */
+ while (slot && slot->base_gfn < pages_2g) {
+ len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
+ s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
+ /* Take the next memslot */
+ slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
+ }
+
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct uv_cb_destroy_fast uvcb = {
+ .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST,
+ .header.len = sizeof(uvcb),
+ .handle = kvm_s390_pv_get_handle(kvm),
+ };
+ int cc;
+
+ cc = uv_call_sched(0, (u64)&uvcb);
+ if (rc)
+ *rc = uvcb.header.rc;
+ if (rrc)
+ *rrc = uvcb.header.rrc;
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
+ uvcb.header.rc, uvcb.header.rrc);
+ WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
+ kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
+ /* Inteded memory leak on "impossible" error */
+ if (!cc)
+ kvm_s390_pv_dealloc_vm(kvm);
+ return cc ? -EIO : 0;
+}
+
+static inline bool is_destroy_fast_available(void)
+{
+ return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list);
+}
+
+/**
+ * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown.
+ * @kvm: the VM
+ * @rc: return value for the RC field of the UVCB
+ * @rrc: return value for the RRC field of the UVCB
+ *
+ * Set aside the protected VM for a subsequent teardown. The VM will be able
+ * to continue immediately as a non-secure VM, and the information needed to
+ * properly tear down the protected VM is set aside. If another protected VM
+ * was already set aside without starting its teardown, this function will
+ * fail.
+ * The CPUs of the protected VM need to be destroyed beforehand.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return: 0 in case of success, -EINVAL if another protected VM was already set
+ * aside, -ENOMEM if the system ran out of memory.
+ */
+int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *priv;
+ int res = 0;
+
+ lockdep_assert_held(&kvm->lock);
/*
- * if the mm still has a mapping, make all its pages accessible
- * before destroying the guest
+ * If another protected VM was already prepared for teardown, refuse.
+ * A normal deinitialization has to be performed instead.
*/
- if (mmget_not_zero(kvm->mm)) {
- s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
- mmput(kvm->mm);
+ if (kvm->arch.pv.set_aside)
+ return -EINVAL;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ if (is_destroy_fast_available()) {
+ res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
+ } else {
+ priv->stor_var = kvm->arch.pv.stor_var;
+ priv->stor_base = kvm->arch.pv.stor_base;
+ priv->handle = kvm_s390_pv_get_handle(kvm);
+ priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ if (s390_replace_asce(kvm->arch.gmap))
+ res = -ENOMEM;
}
+ if (res) {
+ kfree(priv);
+ return res;
+ }
+
+ kvm_s390_destroy_lower_2g(kvm);
+ kvm_s390_clear_pv_state(kvm);
+ kvm->arch.pv.set_aside = priv;
+
+ *rc = UVC_RC_EXECUTED;
+ *rrc = 42;
+ return 0;
+}
+
+/**
+ * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM
+ * @kvm: the KVM whose protected VM needs to be deinitialized
+ * @rc: the RC code of the UVC
+ * @rrc: the RRC code of the UVC
+ *
+ * Deinitialize the current protected VM. This function will destroy and
+ * cleanup the current protected VM, but it will not cleanup the guest
+ * memory. This function should only be called when the protected VM has
+ * just been created and therefore does not have any guest memory, or when
+ * the caller cleans up the guest memory separately.
+ *
+ * This function should not fail, but if it does, the donated memory must
+ * not be freed.
+ *
+ * Context: kvm->lock needs to be held
+ *
+ * Return: 0 in case of success, otherwise -EIO
+ */
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ int cc;
+
+ cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+ UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+ WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
if (!cc) {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
@@ -189,11 +383,137 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return cc ? -EIO : 0;
}
+/**
+ * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated
+ * with a specific KVM.
+ * @kvm: the KVM to be cleaned up
+ * @rc: the RC code of the first failing UVC
+ * @rrc: the RRC code of the first failing UVC
+ *
+ * This function will clean up all protected VMs associated with a KVM.
+ * This includes the active one, the one prepared for deinitialization with
+ * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list.
+ *
+ * Context: kvm->lock needs to be held unless being called from
+ * kvm_arch_destroy_vm.
+ *
+ * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO
+ */
+int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *cur;
+ bool need_zap = false;
+ u16 _rc, _rrc;
+ int cc = 0;
+
+ /* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */
+ atomic_inc(&kvm->mm->context.protected_count);
+
+ *rc = 1;
+ /* If the current VM is protected, destroy it */
+ if (kvm_s390_pv_get_handle(kvm)) {
+ cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
+ need_zap = true;
+ }
+
+ /* If a previous protected VM was set aside, put it in the need_cleanup list */
+ if (kvm->arch.pv.set_aside) {
+ list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
+ kvm->arch.pv.set_aside = NULL;
+ }
+
+ /* Cleanup all protected VMs in the need_cleanup list */
+ while (!list_empty(&kvm->arch.pv.need_cleanup)) {
+ cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list);
+ need_zap = true;
+ if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) {
+ cc = 1;
+ /*
+ * Only return the first error rc and rrc, so make
+ * sure it is not overwritten. All destroys will
+ * additionally be reported via KVM_UV_EVENT().
+ */
+ if (*rc == UVC_RC_EXECUTED) {
+ *rc = _rc;
+ *rrc = _rrc;
+ }
+ }
+ list_del(&cur->list);
+ kfree(cur);
+ }
+
+ /*
+ * If the mm still has a mapping, try to mark all its pages as
+ * accessible. The counter should not reach zero before this
+ * cleanup has been performed.
+ */
+ if (need_zap && mmget_not_zero(kvm->mm)) {
+ s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ mmput(kvm->mm);
+ }
+
+ /* Now the counter can safely reach 0 */
+ atomic_dec(&kvm->mm->context.protected_count);
+ return cc ? -EIO : 0;
+}
+
+/**
+ * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM.
+ * @kvm: the VM previously associated with the protected VM
+ * @rc: return value for the RC field of the UVCB
+ * @rrc: return value for the RRC field of the UVCB
+ *
+ * Tear down the protected VM that had been previously prepared for teardown
+ * using kvm_s390_pv_set_aside_vm. Ideally this should be called by
+ * userspace asynchronously from a separate thread.
+ *
+ * Context: kvm->lock must not be held.
+ *
+ * Return: 0 in case of success, -EINVAL if no protected VM had been
+ * prepared for asynchronous teardowm, -EIO in case of other errors.
+ */
+int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+ struct pv_vm_to_be_destroyed *p;
+ int ret = 0;
+
+ lockdep_assert_not_held(&kvm->lock);
+ mutex_lock(&kvm->lock);
+ p = kvm->arch.pv.set_aside;
+ kvm->arch.pv.set_aside = NULL;
+ mutex_unlock(&kvm->lock);
+ if (!p)
+ return -EINVAL;
+
+ /* When a fatal signal is received, stop immediately */
+ if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+ goto done;
+ if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
+ ret = -EIO;
+ kfree(p);
+ p = NULL;
+done:
+ /*
+ * p is not NULL if we aborted because of a fatal signal, in which
+ * case queue the leftover for later cleanup.
+ */
+ if (p) {
+ mutex_lock(&kvm->lock);
+ list_add(&p->list, &kvm->arch.pv.need_cleanup);
+ mutex_unlock(&kvm->lock);
+ /* Did not finish, but pretend things went well */
+ *rc = UVC_RC_EXECUTED;
+ *rrc = 42;
+ }
+ return ret;
+}
+
static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
struct mm_struct *mm)
{
struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
u16 dummy;
+ int r;
/*
* No locking is needed since this is the last thread of the last user of this
@@ -202,7 +522,9 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
* unregistered. This means that if this notifier runs, then the
* struct kvm is still valid.
*/
- kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+ r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+ if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
+ kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
}
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
@@ -226,8 +548,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
uvcb.guest_stor_len = kvm->arch.pv.guest_len;
uvcb.guest_asce = kvm->arch.gmap->asce;
- uvcb.guest_sca = (unsigned long)kvm->arch.sca;
- uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
+ uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
+ uvcb.conf_base_stor_origin =
+ virt_to_phys((void *)kvm->arch.pv.stor_base);
uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
cc = uv_call_sched(0, (u64)&uvcb);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 94138f8f0c1c..0e9d020d7093 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -654,7 +654,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
page = gfn_to_page(kvm, gpa_to_gfn(gpa));
if (is_error_page(page))
return -EINVAL;
- *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
return 0;
}
@@ -869,7 +869,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
WARN_ON_ONCE(rc);
return 1;
}
- vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ vsie_page->scb_o = phys_to_virt(hpa);
return 0;
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 02d15c8dc92e..2ccfcc8a3863 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -72,7 +72,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
goto out_free;
page->index = 0;
list_add(&page->lru, &gmap->crst_list);
- table = (unsigned long *) page_to_phys(page);
+ table = page_to_virt(page);
crst_table_init(table, etype);
gmap->table = table;
gmap->asce = atype | _ASCE_TABLE_LENGTH |
@@ -311,12 +311,12 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
- new = (unsigned long *) page_to_phys(page);
+ new = page_to_virt(page);
crst_table_init(new, init);
spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
- *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+ *table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
page->index = gaddr;
page = NULL;
@@ -557,7 +557,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
gaddr & _REGION1_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
@@ -565,7 +565,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
gaddr & _REGION2_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
@@ -573,7 +573,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
gaddr & _REGION3_MASK))
return -ENOMEM;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
}
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
/* Walk the parent mm page table */
@@ -813,7 +813,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION2:
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
@@ -821,7 +821,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION3:
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
@@ -829,7 +829,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
@@ -837,7 +837,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
break;
if (*table & _REGION_ENTRY_INVALID)
return NULL;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
}
return table;
@@ -1150,7 +1150,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
address = pte_val(pte) & PAGE_MASK;
address += gaddr & ~PAGE_MASK;
- *val = *(unsigned long *) address;
+ *val = *(unsigned long *)__va(address);
set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
@@ -1335,7 +1335,8 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
- unsigned long sto, *ste, *pgt;
+ unsigned long *ste;
+ phys_addr_t sto, pgt;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1343,13 +1344,13 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
return;
gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
- sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
+ sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
- pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
*ste = _SEGMENT_ENTRY_EMPTY;
- __gmap_unshadow_pgt(sg, raddr, pgt);
+ __gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ page = phys_to_page(pgt);
list_del(&page->lru);
page_table_free_pgste(page);
}
@@ -1365,19 +1366,19 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
unsigned long *sgt)
{
- unsigned long *pgt;
struct page *page;
+ phys_addr_t pgt;
int i;
BUG_ON(!gmap_is_shadow(sg));
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
continue;
- pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
sgt[i] = _SEGMENT_ENTRY_EMPTY;
- __gmap_unshadow_pgt(sg, raddr, pgt);
+ __gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
- page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ page = phys_to_page(pgt);
list_del(&page->lru);
page_table_free_pgste(page);
}
@@ -1392,7 +1393,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
- unsigned long r3o, *r3e, *sgt;
+ unsigned long r3o, *r3e;
+ phys_addr_t sgt;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1401,12 +1403,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
- gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
- sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
+ sgt = *r3e & _REGION_ENTRY_ORIGIN;
*r3e = _REGION3_ENTRY_EMPTY;
- __gmap_unshadow_sgt(sg, raddr, sgt);
+ __gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
- page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ page = phys_to_page(sgt);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1422,19 +1424,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
unsigned long *r3t)
{
- unsigned long *sgt;
struct page *page;
+ phys_addr_t sgt;
int i;
BUG_ON(!gmap_is_shadow(sg));
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
continue;
- sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
r3t[i] = _REGION3_ENTRY_EMPTY;
- __gmap_unshadow_sgt(sg, raddr, sgt);
+ __gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
- page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ page = phys_to_page(sgt);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1449,7 +1451,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
- unsigned long r2o, *r2e, *r3t;
+ unsigned long r2o, *r2e;
+ phys_addr_t r3t;
struct page *page;
BUG_ON(!gmap_is_shadow(sg));
@@ -1458,12 +1461,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
- gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
- r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
+ r3t = *r2e & _REGION_ENTRY_ORIGIN;
*r2e = _REGION2_ENTRY_EMPTY;
- __gmap_unshadow_r3t(sg, raddr, r3t);
+ __gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
- page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ page = phys_to_page(r3t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1479,7 +1482,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
unsigned long *r2t)
{
- unsigned long *r3t;
+ phys_addr_t r3t;
struct page *page;
int i;
@@ -1487,11 +1490,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
continue;
- r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
r2t[i] = _REGION2_ENTRY_EMPTY;
- __gmap_unshadow_r3t(sg, raddr, r3t);
+ __gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
- page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ page = phys_to_page(r3t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1506,8 +1509,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
*/
static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
{
- unsigned long r1o, *r1e, *r2t;
+ unsigned long r1o, *r1e;
struct page *page;
+ phys_addr_t r2t;
BUG_ON(!gmap_is_shadow(sg));
r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
@@ -1515,12 +1519,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
return;
gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
- gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
- r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
+ r2t = *r1e & _REGION_ENTRY_ORIGIN;
*r1e = _REGION1_ENTRY_EMPTY;
- __gmap_unshadow_r2t(sg, raddr, r2t);
+ __gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Free region 2 table */
- page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ page = phys_to_page(r2t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1536,22 +1540,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
unsigned long *r1t)
{
- unsigned long asce, *r2t;
+ unsigned long asce;
struct page *page;
+ phys_addr_t r2t;
int i;
BUG_ON(!gmap_is_shadow(sg));
- asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ asce = __pa(r1t) | _ASCE_TYPE_REGION1;
for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
continue;
- r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
- __gmap_unshadow_r2t(sg, raddr, r2t);
+ r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
+ __gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Clear entry and flush translation r1t -> r2t */
gmap_idte_one(asce, raddr);
r1t[i] = _REGION1_ENTRY_EMPTY;
/* Free region 2 table */
- page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ page = phys_to_page(r2t);
list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1573,7 +1578,7 @@ static void gmap_unshadow(struct gmap *sg)
sg->removed = 1;
gmap_call_notifier(sg, 0, -1UL);
gmap_flush_tlb(sg);
- table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ table = __va(sg->asce & _ASCE_ORIGIN);
switch (sg->asce & _ASCE_TYPE_MASK) {
case _ASCE_TYPE_REGION1:
__gmap_unshadow_r1t(sg, 0, table);
@@ -1748,7 +1753,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_r2t, *table;
+ unsigned long *table;
+ phys_addr_t s_r2t;
struct page *page;
int rc;
@@ -1760,7 +1766,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
page->index = r2t & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_r2t = (unsigned long *) page_to_phys(page);
+ s_r2t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
@@ -1775,9 +1781,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
rc = -EAGAIN; /* Race with shadow */
goto out_free;
}
- crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ *table = s_r2t | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r2t & _REGION_ENTRY_PROTECT);
@@ -1798,8 +1804,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 4);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_r2t)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -1832,7 +1837,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_r3t, *table;
+ unsigned long *table;
+ phys_addr_t s_r3t;
struct page *page;
int rc;
@@ -1844,7 +1850,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
page->index = r3t & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_r3t = (unsigned long *) page_to_phys(page);
+ s_r3t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
@@ -1859,9 +1865,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
rc = -EAGAIN; /* Race with shadow */
goto out_free;
}
- crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ *table = s_r3t | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r3t & _REGION_ENTRY_PROTECT);
@@ -1882,8 +1888,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 3);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_r3t)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -1916,7 +1921,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake)
{
unsigned long raddr, origin, offset, len;
- unsigned long *s_sgt, *table;
+ unsigned long *table;
+ phys_addr_t s_sgt;
struct page *page;
int rc;
@@ -1928,7 +1934,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
page->index = sgt & _REGION_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_sgt = (unsigned long *) page_to_phys(page);
+ s_sgt = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
@@ -1943,9 +1949,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
rc = -EAGAIN; /* Race with shadow */
goto out_free;
}
- crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
- *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ *table = s_sgt | _REGION_ENTRY_LENGTH |
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= sgt & _REGION_ENTRY_PROTECT;
@@ -1966,8 +1972,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 2);
- if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
- (unsigned long) s_sgt)
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_REGION_ENTRY_INVALID;
@@ -2040,8 +2045,9 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake)
{
unsigned long raddr, origin;
- unsigned long *s_pgt, *table;
+ unsigned long *table;
struct page *page;
+ phys_addr_t s_pgt;
int rc;
BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
@@ -2052,7 +2058,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
if (fake)
page->index |= GMAP_SHADOW_FAKE_TABLE;
- s_pgt = (unsigned long *) page_to_phys(page);
+ s_pgt = page_to_phys(page);
/* Install shadow page table */
spin_lock(&sg->guest_table_lock);
table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
@@ -2085,8 +2091,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
spin_lock(&sg->guest_table_lock);
if (!rc) {
table = gmap_table_walk(sg, saddr, 1);
- if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
- (unsigned long) s_pgt)
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
rc = -EAGAIN; /* Race with unshadow */
else
*table &= ~_SEGMENT_ENTRY_INVALID;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 97d66a3e60fb..d509656c67d7 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -140,25 +140,25 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-int set_memory_encrypted(unsigned long addr, int numpages)
+int set_memory_encrypted(unsigned long vaddr, int numpages)
{
int i;
/* make specified pages unshared, (swiotlb, dma_free) */
for (i = 0; i < numpages; ++i) {
- uv_remove_shared(addr);
- addr += PAGE_SIZE;
+ uv_remove_shared(virt_to_phys((void *)vaddr));
+ vaddr += PAGE_SIZE;
}
return 0;
}
-int set_memory_decrypted(unsigned long addr, int numpages)
+int set_memory_decrypted(unsigned long vaddr, int numpages)
{
int i;
/* make specified pages shared (swiotlb, dma_alloca) */
for (i = 0; i < numpages; ++i) {
- uv_set_shared(addr);
- addr += PAGE_SIZE;
+ uv_set_shared(virt_to_phys((void *)vaddr));
+ vaddr += PAGE_SIZE;
}
return 0;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8259d725054d..4dbde69c423b 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1603,10 +1603,8 @@ clear_arch_lbr:
* x86_perf_get_lbr - get the LBR records information
*
* @lbr: the caller's memory to store the LBR records information
- *
- * Returns: 0 indicates the LBR info has been successfully obtained
*/
-int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
int lbr_fmt = x86_pmu.intel_cap.lbr_format;
@@ -1614,8 +1612,6 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
lbr->from = x86_pmu.lbr_from;
lbr->to = x86_pmu.lbr_to;
lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
-
- return 0;
}
EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b71f4f2ecdd5..1419c4e04d45 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -308,6 +308,9 @@
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
+#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 3089ec352743..e3efaf6e6b62 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -61,6 +61,8 @@
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
/* Support for debug MSRs available */
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
+/* Support for extended gva ranges for flush hypercalls available */
+#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14)
/*
* Support for returning hypercall output block via XMM
* registers is available
@@ -598,6 +600,41 @@ struct hv_enlightened_vmcs {
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+/*
+ * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
+ * pairing it with architecturally impossible exit reasons. Bit 28 is set only
+ * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit
+ * is pending. I.e. it will never be set by hardware for non-SMI exits (there
+ * are only three), nor will it ever be set unless the VMM is an STM.
+ */
+#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose
+ * SVM enlightenments to guests.
+ */
+struct hv_vmcb_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB.
+ */
+#define HV_VMCB_NESTED_ENLIGHTENMENTS 31
+
+/* Synthetic VM-Exit */
+#define HV_SVM_EXITCODE_ENL 0xf0000000
+#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1)
+
struct hv_partition_assist_pg {
u32 tlb_lock_count;
};
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 82ba4a564e58..abccd51dcfca 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -110,10 +110,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
KVM_X86_OP_OPTIONAL(set_hv_timer)
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
+#ifdef CONFIG_KVM_SMM
KVM_X86_OP(smi_allowed)
KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
+#endif
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
@@ -123,7 +125,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
-KVM_X86_OP_OPTIONAL(enable_direct_tlbflush)
+KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4dbde7d9eb1..ad9f8b02071d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
#include <linux/hyperv.h>
+#include <linux/kfifo.h>
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -81,7 +82,9 @@
#define KVM_REQ_NMI KVM_ARCH_REQ(9)
#define KVM_REQ_PMU KVM_ARCH_REQ(10)
#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#ifdef CONFIG_KVM_SMM
#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#endif
#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
#define KVM_REQ_MCLOCK_INPROGRESS \
KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
@@ -108,6 +111,8 @@
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_TLB_FLUSH \
+ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -204,6 +209,7 @@ typedef enum exit_fastpath_completion fastpath_t;
struct x86_emulate_ctxt;
struct x86_exception;
+union kvm_smram;
enum x86_intercept;
enum x86_intercept_stage;
@@ -253,16 +259,16 @@ enum x86_intercept_stage;
#define PFERR_GUEST_PAGE_BIT 33
#define PFERR_IMPLICIT_ACCESS_BIT 48
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
-#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
-#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
-#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
-#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT)
+#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
+#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
+#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
+#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
+#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
PFERR_WRITE_MASK | \
@@ -488,20 +494,27 @@ enum pmc_type {
struct kvm_pmc {
enum pmc_type type;
u8 idx;
+ bool is_paused;
+ bool intr;
u64 counter;
+ u64 prev_counter;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
/*
+ * only for creating or reusing perf_event,
* eventsel value for general purpose counters,
* ctrl value for fixed counters.
*/
u64 current_config;
- bool is_paused;
- bool intr;
};
+/* More counters may conflict with other existing Architectural MSRs */
+#define KVM_INTEL_PMC_MAX_GENERIC 8
+#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
+#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define KVM_PMC_MAX_FIXED 3
+#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
@@ -516,10 +529,19 @@ struct kvm_pmu {
u64 reserved_bits;
u64 raw_event_mask;
u8 version;
- struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
+ struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
struct irq_work irq_work;
- DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+
+ /*
+ * Overlay the bitmap with a 64-bit atomic so that all bits can be
+ * set in a single access, e.g. to reprogram all counters when the PMU
+ * filter changes.
+ */
+ union {
+ DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+ atomic64_t __reprogram_pmi;
+ };
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
@@ -597,6 +619,29 @@ struct kvm_vcpu_hv_synic {
bool dont_zero_synic_pages;
};
+/* The maximum number of entries on the TLB flush fifo. */
+#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)
+/*
+ * Note: the following 'magic' entry is made up by KVM to avoid putting
+ * anything besides GVA on the TLB flush fifo. It is theoretically possible
+ * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000
+ * which will look identical. KVM's action to 'flush everything' instead of
+ * flushing these particular addresses is, however, fully legitimate as
+ * flushing more than requested is always OK.
+ */
+#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1)
+
+enum hv_tlb_flush_fifos {
+ HV_L1_TLB_FLUSH_FIFO,
+ HV_L2_TLB_FLUSH_FIFO,
+ HV_NR_TLB_FLUSH_FIFOS,
+};
+
+struct kvm_vcpu_hv_tlb_flush_fifo {
+ spinlock_t write_lock;
+ DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+};
+
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
struct kvm_vcpu *vcpu;
@@ -618,6 +663,19 @@ struct kvm_vcpu_hv {
u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
} cpuid_cache;
+
+ struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
+
+ /* Preallocated buffer for handling hypercalls passing sparse vCPU set */
+ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
+
+ struct hv_vp_assist_page vp_assist_page;
+
+ struct {
+ u64 pa_page_gpa;
+ u64 vm_id;
+ u32 vp_id;
+ } nested;
};
/* Xen HVM per vcpu emulation context */
@@ -1151,7 +1209,18 @@ struct kvm_arch {
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
- struct list_head lpage_disallowed_mmu_pages;
+ /*
+ * A list of kvm_mmu_page structs that, if zapped, could possibly be
+ * replaced by an NX huge page. A shadow page is on this list if its
+ * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+ * and there are no other conditions that prevent a huge page, e.g.
+ * the backing host page is huge, dirtly logging is not enabled for its
+ * memslot, etc... Note, zapping shadow pages on this list doesn't
+ * guarantee an NX huge page will be created in its stead, e.g. if the
+ * guest attempts to execute from the region then KVM obviously can't
+ * create an NX huge page (without hanging the guest).
+ */
+ struct list_head possible_nx_huge_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
/*
@@ -1267,7 +1336,7 @@ struct kvm_arch {
bool sgx_provisioning_allowed;
struct kvm_pmu_event_filter __rcu *pmu_event_filter;
- struct task_struct *nx_lpage_recovery_thread;
+ struct task_struct *nx_huge_page_recovery_thread;
#ifdef CONFIG_X86_64
/*
@@ -1279,6 +1348,9 @@ struct kvm_arch {
*/
bool tdp_mmu_enabled;
+ /* The number of TDP MMU pages across all roots. */
+ atomic64_t tdp_mmu_pages;
+
/*
* List of kvm_mmu_page structs being used as roots.
* All kvm_mmu_page structs in the list should have
@@ -1300,20 +1372,12 @@ struct kvm_arch {
struct list_head tdp_mmu_roots;
/*
- * List of kvm_mmu_page structs not being used as roots.
- * All kvm_mmu_page structs in the list should have
- * tdp_mmu_page set and a tdp_mmu_root_count of 0.
- */
- struct list_head tdp_mmu_pages;
-
- /*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
* - tdp_mmu_roots (above)
- * - tdp_mmu_pages (above)
* - the link field of kvm_mmu_page structs used by the TDP MMU
- * - lpage_disallowed_mmu_pages
- * - the lpage_disallowed_link field of kvm_mmu_page structs used
+ * - possible_nx_huge_pages;
+ * - the possible_nx_huge_page_link field of kvm_mmu_page structs used
* by the TDP MMU
* It is acceptable, but not necessary, to acquire this lock when
* the thread holds the MMU lock in write mode.
@@ -1607,10 +1671,12 @@ struct kvm_x86_ops {
void (*setup_mce)(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM_SMM
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
- int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
- int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
+ int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+ int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
+#endif
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
@@ -1625,7 +1691,7 @@ struct kvm_x86_ops {
void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
- int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+ int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
@@ -1658,6 +1724,7 @@ struct kvm_x86_nested_ops {
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+ void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_init_ops {
@@ -1839,6 +1906,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
@@ -1904,8 +1972,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1989,14 +2055,18 @@ enum {
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+
+#ifdef CONFIG_KVM_SMM
#define HF_SMM_MASK (1 << 6)
#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
-#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
-#define KVM_ADDRESS_SPACE_NUM 2
-
-#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
-#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+# define KVM_ADDRESS_SPACE_NUM 2
+# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+#else
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
+#endif
#define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -2084,12 +2154,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
-#define GET_SMSTATE(type, buf, offset) \
- (*(type *)((buf) + (offset) - 0x7e00))
-
int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define KVM_CLOCK_VALID_FLAGS \
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 9ac46dbe57d4..5d0f6891ae61 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -543,12 +543,12 @@ static inline void perf_check_microcode(void) { }
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
-extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
+extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
-static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
- return -1;
+ memset(lbr, 0, sizeof(*lbr));
}
#endif
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index 5393babc0598..cb0386fc4dc3 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -13,7 +13,7 @@
* Takes the guest view of SPEC_CTRL MSR as a parameter and also
* the guest's version of VIRT_SPEC_CTRL, if emulated.
*/
-extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest);
+extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest);
/**
* x86_spec_ctrl_set_guest - Set speculation control registers for the guest
@@ -24,9 +24,9 @@ extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bo
* Avoids writing to the MSR if the content/bits are the same
*/
static inline
-void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl)
{
- x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true);
+ x86_virt_spec_ctrl(guest_virt_spec_ctrl, true);
}
/**
@@ -38,9 +38,9 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
* Avoids writing to the MSR if the content/bits are the same
*/
static inline
-void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl)
{
- x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false);
+ x86_virt_spec_ctrl(guest_virt_spec_ctrl, false);
}
/* AMD specific Speculative Store Bypass MSR data */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0361626841bc..cb1ee53ad3b1 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -5,6 +5,8 @@
#include <uapi/asm/svm.h>
#include <uapi/asm/kvm.h>
+#include <asm/hyperv-tlfs.h>
+
/*
* 32-bit intercept words in the VMCB Control Area, starting
* at Byte offset 000h.
@@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
*/
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
@@ -293,12 +298,13 @@ struct vmcb_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u8 reserved_1[42];
+ /* Reserved fields are named following their struct offset */
+ u8 reserved_0xa0[42];
u8 vmpl;
u8 cpl;
- u8 reserved_2[4];
+ u8 reserved_0xcc[4];
u64 efer;
- u8 reserved_3[112];
+ u8 reserved_0xd8[112];
u64 cr4;
u64 cr3;
u64 cr0;
@@ -306,7 +312,7 @@ struct vmcb_save_area {
u64 dr6;
u64 rflags;
u64 rip;
- u8 reserved_4[88];
+ u8 reserved_0x180[88];
u64 rsp;
u64 s_cet;
u64 ssp;
@@ -321,14 +327,14 @@ struct vmcb_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_5[32];
+ u8 reserved_0x248[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
- u8 reserved_6[72];
+ u8 reserved_0x298[72];
u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
} __packed;
@@ -349,12 +355,12 @@ struct sev_es_save_area {
u64 vmpl2_ssp;
u64 vmpl3_ssp;
u64 u_cet;
- u8 reserved_1[2];
+ u8 reserved_0xc8[2];
u8 vmpl;
u8 cpl;
- u8 reserved_2[4];
+ u8 reserved_0xcc[4];
u64 efer;
- u8 reserved_3[104];
+ u8 reserved_0xd8[104];
u64 xss;
u64 cr4;
u64 cr3;
@@ -371,7 +377,7 @@ struct sev_es_save_area {
u64 dr1_addr_mask;
u64 dr2_addr_mask;
u64 dr3_addr_mask;
- u8 reserved_4[24];
+ u8 reserved_0x1c0[24];
u64 rsp;
u64 s_cet;
u64 ssp;
@@ -386,21 +392,21 @@ struct sev_es_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_5[32];
+ u8 reserved_0x248[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
- u8 reserved_7[80];
+ u8 reserved_0x298[80];
u32 pkru;
- u8 reserved_8[20];
- u64 reserved_9; /* rax already available at 0x01f8 */
+ u32 tsc_aux;
+ u8 reserved_0x2f0[24];
u64 rcx;
u64 rdx;
u64 rbx;
- u64 reserved_10; /* rsp already available at 0x01d8 */
+ u64 reserved_0x320; /* rsp already available at 0x01d8 */
u64 rbp;
u64 rsi;
u64 rdi;
@@ -412,7 +418,7 @@ struct sev_es_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_11[16];
+ u8 reserved_0x380[16];
u64 guest_exit_info_1;
u64 guest_exit_info_2;
u64 guest_exit_int_info;
@@ -425,7 +431,7 @@ struct sev_es_save_area {
u64 pcpu_id;
u64 event_inj;
u64 xcr0;
- u8 reserved_12[16];
+ u8 reserved_0x3f0[16];
/* Floating point area */
u64 x87_dp;
@@ -443,23 +449,23 @@ struct sev_es_save_area {
} __packed;
struct ghcb_save_area {
- u8 reserved_1[203];
+ u8 reserved_0x0[203];
u8 cpl;
- u8 reserved_2[116];
+ u8 reserved_0xcc[116];
u64 xss;
- u8 reserved_3[24];
+ u8 reserved_0x148[24];
u64 dr7;
- u8 reserved_4[16];
+ u8 reserved_0x168[16];
u64 rip;
- u8 reserved_5[88];
+ u8 reserved_0x180[88];
u64 rsp;
- u8 reserved_6[24];
+ u8 reserved_0x1e0[24];
u64 rax;
- u8 reserved_7[264];
+ u8 reserved_0x200[264];
u64 rcx;
u64 rdx;
u64 rbx;
- u8 reserved_8[8];
+ u8 reserved_0x320[8];
u64 rbp;
u64 rsi;
u64 rdi;
@@ -471,12 +477,12 @@ struct ghcb_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_9[16];
+ u8 reserved_0x380[16];
u64 sw_exit_code;
u64 sw_exit_info_1;
u64 sw_exit_info_2;
u64 sw_scratch;
- u8 reserved_10[56];
+ u8 reserved_0x3b0[56];
u64 xcr0;
u8 valid_bitmap[16];
u64 x87_state_gpa;
@@ -490,7 +496,7 @@ struct ghcb {
u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
- u8 reserved_1[10];
+ u8 reserved_0xff0[10];
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
u32 ghcb_usage;
} __packed;
@@ -502,6 +508,9 @@ struct ghcb {
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
+#define BUILD_BUG_RESERVED_OFFSET(x, y) \
+ ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y)
+
static inline void __unused_size_checks(void)
{
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
@@ -509,6 +518,39 @@ static inline void __unused_size_checks(void)
BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
+
+ /* Check offsets of reserved fields */
+
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298);
+
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
}
struct vmcb {
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 46de10a809ec..c6df6b16a088 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -214,6 +214,8 @@ struct kvm_msr_list {
struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_READ (1 << 0)
#define KVM_MSR_FILTER_WRITE (1 << 1)
+#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \
+ KVM_MSR_FILTER_WRITE)
__u32 flags;
__u32 nmsrs; /* number of msrs in bitmap */
__u32 base; /* MSR index the bitmap starts at */
@@ -222,8 +224,11 @@ struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_MAX_RANGES 16
struct kvm_msr_filter {
+#ifndef __KERNEL__
#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#endif
#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY)
__u32 flags;
struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
};
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cb50589a7102..437308004ef2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -19,7 +19,6 @@
#include <asm/suspend.h>
#include <asm/tlbflush.h>
#include <asm/tdx.h>
-#include "../kvm/vmx/vmx.h"
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -108,9 +107,4 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
-
- if (IS_ENABLED(CONFIG_KVM_INTEL)) {
- BLANK();
- OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
- }
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index da7c361f47e0..3e3230cccaa7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -196,22 +196,15 @@ void __init check_bugs(void)
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
-x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d4e48b4a438b..cf886f86038a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
- u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ u64 pa;
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000000..615d6ff35c00
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 67be7f217e37..fbeaa9ddef59 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -118,6 +118,17 @@ config KVM_AMD_SEV
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
config KVM_XEN
bool "Support for Xen hypercall interface"
depends on KVM
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 30f244b64523..80e3fe184d17 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -20,12 +20,14 @@ endif
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
+ svm/sev.o svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-amd-y += svm/svm_onhyperv.o
@@ -34,3 +36,15 @@ endif
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_svm/vmenter.o := -iquote $(obj)
+$(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
+
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 62bc7a01cecc..723502181a3a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -62,10 +62,16 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
* This one is tied to SSB in the user API, and not
* visible in /proc/cpuinfo.
*/
-#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+#define KVM_X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
#define F feature_bit
-#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SF(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
+})
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
@@ -543,9 +549,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
}
static __always_inline
-void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
BUILD_BUG_ON(leaf < NCAPINTS);
kvm_cpu_caps[leaf] = mask;
@@ -555,7 +561,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
BUILD_BUG_ON(leaf >= NCAPINTS);
kvm_cpu_caps[leaf] &= mask;
@@ -657,14 +663,19 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX_VNNI) | F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
+ F(AVX_IFMA)
+ );
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
- kvm_cpu_cap_init_scattered(CPUID_12_EAX,
+ kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
SF(SGX1) | SF(SGX2)
);
@@ -694,7 +705,7 @@ void kvm_set_cpu_caps(void)
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- __feature_bit(KVM_X86_FEATURE_PSFD)
+ __feature_bit(KVM_X86_FEATURE_AMD_PSFD)
);
/*
@@ -913,9 +924,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
entry->ebx = 0;
entry->ecx = 0;
- entry->edx = 0;
}
break;
case 0xa: { /* Architectural Performance Monitoring */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a43261d25a2..5cc3efa0e21c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -242,37 +242,6 @@ enum x86_transfer_type {
X86_TRANSFER_TASK_SWITCH,
};
-static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- if (!(ctxt->regs_valid & (1 << nr))) {
- ctxt->regs_valid |= 1 << nr;
- ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
- }
- return ctxt->_regs[nr];
-}
-
-static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
- BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
-
- ctxt->regs_valid |= 1 << nr;
- ctxt->regs_dirty |= 1 << nr;
- return &ctxt->_regs[nr];
-}
-
-static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- reg_read(ctxt, nr);
- return reg_write(ctxt, nr);
-}
-
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
unsigned long dirty = ctxt->regs_dirty;
@@ -2338,335 +2307,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
-{
-#ifdef CONFIG_X86_64
- return ctxt->ops->guest_has_long_mode(ctxt);
-#else
- return false;
-#endif
-}
-
-static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
-{
- desc->g = (flags >> 23) & 1;
- desc->d = (flags >> 22) & 1;
- desc->l = (flags >> 21) & 1;
- desc->avl = (flags >> 20) & 1;
- desc->p = (flags >> 15) & 1;
- desc->dpl = (flags >> 13) & 3;
- desc->s = (flags >> 12) & 1;
- desc->type = (flags >> 8) & 15;
-}
-
-static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
- else
- offset = 0x7f2c + (n - 3) * 12;
-
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
- return X86EMUL_CONTINUE;
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
- u32 base3;
-
- offset = 0x7e00 + n * 16;
-
- selector = GET_SMSTATE(u16, smstate, offset);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- base3 = GET_SMSTATE(u32, smstate, offset + 12);
-
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
- return X86EMUL_CONTINUE;
-}
-#endif
-
-static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
- u64 cr0, u64 cr3, u64 cr4)
-{
- int bad;
- u64 pcid;
-
- /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
- pcid = 0;
- if (cr4 & X86_CR4_PCIDE) {
- pcid = cr3 & 0xfff;
- cr3 &= ~0xfff;
- }
-
- bad = ctxt->ops->set_cr(ctxt, 3, cr3);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- /*
- * First enable PAE, long mode needs it before CR0.PG = 1 is set.
- * Then enable protected mode. However, PCID cannot be enabled
- * if EFER.LMA=0, so set it separately.
- */
- bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- bad = ctxt->ops->set_cr(ctxt, 0, cr0);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- if (cr4 & X86_CR4_PCIDE) {
- bad = ctxt->ops->set_cr(ctxt, 4, cr4);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- if (pcid) {
- bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- }
-
- }
-
- return X86EMUL_CONTINUE;
-}
-
-static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u16 selector;
- u32 val, cr0, cr3, cr4;
- int i;
-
- cr0 = GET_SMSTATE(u32, smstate, 0x7ffc);
- cr3 = GET_SMSTATE(u32, smstate, 0x7ff8);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
- ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
-
- for (i = 0; i < 8; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
-
- val = GET_SMSTATE(u32, smstate, 0x7fcc);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u32, smstate, 0x7fc8);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc4);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc0);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f74);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f70);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f58);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f54);
- ctxt->ops->set_idt(ctxt, &dt);
-
- for (i = 0; i < 6; i++) {
- int r = rsm_load_seg_32(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- cr4 = GET_SMSTATE(u32, smstate, 0x7f14);
-
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8));
-
- return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u64 val, cr0, cr3, cr4;
- u32 base3;
- u16 selector;
- int i, r;
-
- for (i = 0; i < 16; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
-
- ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
-
- val = GET_SMSTATE(u64, smstate, 0x7f68);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u64, smstate, 0x7f60);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
- cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
- cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
- val = GET_SMSTATE(u64, smstate, 0x7ed0);
-
- if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7e90);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98));
- base3 = GET_SMSTATE(u32, smstate, 0x7e9c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e84);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e88);
- ctxt->ops->set_idt(ctxt, &dt);
-
- selector = GET_SMSTATE(u32, smstate, 0x7e70);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78));
- base3 = GET_SMSTATE(u32, smstate, 0x7e7c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e64);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e68);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
- if (r != X86EMUL_CONTINUE)
- return r;
-
- for (i = 0; i < 6; i++) {
- r = rsm_load_seg_64(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- return X86EMUL_CONTINUE;
-}
-#endif
-
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- unsigned long cr0, cr4, efer;
- char buf[512];
- u64 smbase;
- int ret;
-
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
- smbase = ctxt->ops->get_smbase(ctxt);
-
- ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf));
- if (ret != X86EMUL_CONTINUE)
- return X86EMUL_UNHANDLEABLE;
-
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
- ctxt->ops->set_nmi_mask(ctxt, false);
-
- ctxt->ops->exiting_smm(ctxt);
-
- /*
- * Get back to real mode, to prepare a safe state in which to load
- * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
- * supports long mode.
- */
- if (emulator_has_longmode(ctxt)) {
- struct desc_struct cs_desc;
-
- /* Zero CR4.PCIDE before CR0.PG. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PCIDE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
-
- /* A 32-bit code segment is required to clear EFER.LMA. */
- memset(&cs_desc, 0, sizeof(cs_desc));
- cs_desc.type = 0xb;
- cs_desc.s = cs_desc.g = cs_desc.p = 1;
- ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
- }
-
- /* For the 64-bit case, this will clear EFER.LMA. */
- cr0 = ctxt->ops->get_cr(ctxt, 0);
- if (cr0 & X86_CR0_PE)
- ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
-
- if (emulator_has_longmode(ctxt)) {
- /* Clear CR4.PAE before clearing EFER.LME. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
-
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
- }
-
- /*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
- */
- if (ctxt->ops->leave_smm(ctxt, buf))
- goto emulate_shutdown;
-
-#ifdef CONFIG_X86_64
- if (emulator_has_longmode(ctxt))
- ret = rsm_load_state_64(ctxt, buf);
- else
-#endif
- ret = rsm_load_state_32(ctxt, buf);
-
- if (ret != X86EMUL_CONTINUE)
- goto emulate_shutdown;
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
- /*
- * Note, the ctxt->ops callbacks are responsible for handling side
- * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
- * runtime updates, etc... If that changes, e.g. this flow is moved
- * out of the emulator to make it look more like enter_smm(), then
- * those side effects need to be explicitly handled for both success
- * and shutdown.
- */
return emulator_recalc_and_set_mode(ctxt);
-
-emulate_shutdown:
- ctxt->ops->triple_fault(ctxt);
- return X86EMUL_CONTINUE;
}
static void
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 0adf4a437e85..2c7f2a26421e 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,22 +23,25 @@
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "mmu.h"
#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
#include <trace/events/kvm.h>
#include "trace.h"
#include "irq.h"
#include "fpu.h"
-#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick);
@@ -897,13 +900,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page)
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
{
- if (!kvm_hv_assist_page_enabled(vcpu))
- return false;
- return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
- assist_page, sizeof(*assist_page));
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
}
EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
@@ -954,6 +959,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
hv_vcpu->vp_index = vcpu->vcpu_idx;
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
return 0;
}
@@ -1736,6 +1746,28 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
}
}
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
+
struct kvm_hv_hcall {
u64 param;
u64 ingpa;
@@ -1749,57 +1781,173 @@ struct kvm_hv_hcall {
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
};
-static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
- int consumed_xmm_halves,
- u64 *sparse_banks, gpa_t offset)
-{
- u16 var_cnt;
- int i;
-
- if (hc->var_cnt > 64)
- return -EINVAL;
- /* Ignore banks that cannot possibly contain a legal VP index. */
- var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data,
+ int consumed_xmm_halves, gpa_t offset)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap", KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
if (hc->fast) {
/*
* Each XMM holds two sparse banks, but do not count halves that
* have already been consumed for hypercall parameters.
*/
- if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < var_cnt; i++) {
- int j = i + consumed_xmm_halves;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + consumed_xmm_halves;
if (j % 2)
- sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ data[i] = sse128_hi(hc->xmm[j / 2]);
else
- sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ data[i] = sse128_lo(hc->xmm[j / 2]);
}
return 0;
}
- return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
- var_cnt * sizeof(*sparse_banks));
+ return kvm_read_guest(kvm, hc->ingpa + offset, data,
+ cnt * sizeof(*data));
+}
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks, int consumed_xmm_halves,
+ gpa_t offset)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks, consumed_xmm_halves, offset);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[],
+ int consumed_xmm_halves, gpa_t offset)
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt,
+ entries, consumed_xmm_halves, offset);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit on the fifo leaving one free for 'flush all'
+ * entry in case another request comes in. In case there's not enough
+ * space, just put 'flush all' entry there.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: full fifo always contains 'flush all' entry, no need to check the
+ * return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
}
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last entry, however, needs to be
+ * always left free for 'flush all' entry which gets placed when
+ * there is not enough space to put all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *v;
+ unsigned long i;
bool all_cpus;
+ int consumed_xmm_halves = 0;
+ gpa_t data_offset;
/*
- * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
- * valid mask is a u64. Fail the build if KVM's max allowed number of
- * vCPUs (>4096) would exceed this limit, KVM will additional changes
- * for Hyper-V support to avoid setting the guest up to fail.
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
*/
- BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * 'Slow' hypercall's first parameter is the address in guest's memory
+ * where hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
@@ -1807,14 +1955,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
flush.processor_mask = sse128_lo(hc->xmm[0]);
+ consumed_xmm_halves = 1;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa,
&flush, sizeof(flush))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ data_offset = sizeof(flush);
}
trace_kvm_hv_flush_tlb(flush.processor_mask,
- flush.address_space, flush.flags);
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
@@ -1834,16 +1985,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush_ex.flags = hc->outgpa;
memcpy(&flush_ex.hv_vp_set,
&hc->xmm[0], sizeof(hc->xmm[0]));
+ consumed_xmm_halves = 2;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
sizeof(flush_ex))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ data_offset = sizeof(flush_ex);
}
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
flush_ex.hv_vp_set.format,
flush_ex.address_space,
- flush_ex.flags);
+ flush_ex.flags, is_guest_mode(vcpu));
valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
all_cpus = flush_ex.hv_vp_set.format !=
@@ -1852,29 +2005,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (hc->var_cnt != hweight64(valid_bank_mask))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus)
- goto do_flush;
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
- if (!hc->var_cnt)
- goto ret_success;
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks,
+ consumed_xmm_halves, data_offset))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
- if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
- offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents)))
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ consumed_xmm_halves += hc->var_cnt;
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries,
+ consumed_xmm_halves, data_offset))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
}
-do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- if (all_cpus) {
- kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
- } else {
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporary switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
}
ret_success:
@@ -1883,8 +2102,8 @@ ret_success:
((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
-static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
- unsigned long *vcpu_bitmap)
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
{
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
@@ -1894,7 +2113,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
continue;
/* We fail only when APIC is disabled */
@@ -1904,12 +2125,12 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
@@ -1959,7 +2180,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (!hc->var_cnt)
goto ret_success;
- if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1,
offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents)))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -1969,13 +2190,10 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus) {
- kvm_send_ipi_to_many(kvm, vector, NULL);
- } else {
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
-
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
- }
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
ret_success:
return HV_STATUS_SUCCESS;
@@ -2062,10 +2280,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
trace_kvm_hv_hypercall_done(result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
- return kvm_skip_emulated_instruction(vcpu);
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
}
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
@@ -2502,6 +2735,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->ebx |= HV_DEBUGGING;
ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
/*
* Direct Synthetic timers only make sense with in-kernel
@@ -2545,6 +2779,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
ent->eax |= HV_X64_NESTED_MSR_BITMAP;
ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 1030b1b50552..9f96414a31c5 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -22,6 +22,7 @@
#define __ARCH_X86_KVM_HYPERV_H__
#include <linux/kvm_host.h>
+#include "x86.h"
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -107,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
int timer_index)
@@ -151,4 +151,64 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f371f1292ca3..d8d50558f165 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -165,3 +165,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000000..24a710d37323
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+#include "svm/svm.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_AMD)) {
+ BLANK();
+ OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
+ OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
+ OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+ }
+
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3febc342360c..c09174f73a34 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -200,9 +200,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
-static inline bool is_smm(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hflags & HF_SMM_MASK;
-}
-
#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 89246446d6aa..2d9662be8333 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -117,16 +117,6 @@ struct x86_emulate_ops {
struct x86_exception *fault, bool system);
/*
- * read_phys: Read bytes of standard (non-emulated/special) memory.
- * Used for descriptor reading.
- * @addr: [IN ] Physical address from which to read.
- * @val: [OUT] Value read from memory.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
- void *val, unsigned int bytes);
-
- /*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.
* @addr: [IN ] Linear address to which to write.
@@ -209,11 +199,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
- u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
- void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
@@ -234,8 +221,7 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
- void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
- int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
};
@@ -292,7 +278,6 @@ enum x86emul_mode {
/* These match some of the HF_* flags defined in kvm_host.h */
#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
#define X86EMUL_SMM_MASK (1 << 6)
-#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
/*
* fastop functions are declared as taking a never-defined fastop parameter,
@@ -526,4 +511,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d7639d126e6c..1bb63746e991 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -42,6 +42,7 @@
#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "smm.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -1170,9 +1171,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- result = 1;
- kvm_make_request(KVM_REQ_SMI, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
break;
case APIC_DM_NMI:
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a5ac4a5a5179..28e3769066e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -7,7 +7,7 @@
#include <linux/kvm_host.h>
#include "hyperv.h"
-#include "kvm_cache_regs.h"
+#include "smm.h"
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f81539061d6..4736d7849c60 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -22,6 +22,7 @@
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "smm.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"
@@ -802,15 +803,31 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
}
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- if (sp->lpage_disallowed)
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
return;
++kvm->stat.nx_lpage_splits;
- list_add_tail(&sp->lpage_disallowed_link,
- &kvm->arch.lpage_disallowed_mmu_pages);
- sp->lpage_disallowed = true;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -830,11 +847,20 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
--kvm->stat.nx_lpage_splits;
- sp->lpage_disallowed = false;
- list_del(&sp->lpage_disallowed_link);
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp);
}
static struct kvm_memory_slot *
@@ -1645,7 +1671,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *pos;
u64 *end;
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
@@ -1793,7 +1819,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(ent);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -1894,7 +1920,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.invalid)
return true;
- /* TDP MMU pages due not use the MMU generation. */
+ /* TDP MMU pages do not use the MMU generation. */
return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2129,6 +2155,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
/*
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
* depends on valid pages being added to the head of the list. See
@@ -2350,7 +2378,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(*sptep);
if (child->role.access == direct_access)
return;
@@ -2371,7 +2399,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
} else {
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, spte);
/*
@@ -2486,8 +2514,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
zapped_root = !is_obsolete_sp(kvm, sp);
}
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
sp->role.invalid = 1;
@@ -2810,7 +2838,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -3084,7 +3112,8 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
if (cur_level > PG_LEVEL_4K &&
cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
- !is_large_pte(spte)) {
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
* A small SPTE exists for this pfn, but FNAME(fetch)
* and __direct_map would like to create a large PTE
@@ -3126,9 +3155,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->is_tdp && fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -3148,8 +3177,13 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
+ if (is_sigpending_pfn(pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
/*
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
@@ -3171,7 +3205,7 @@ static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fau
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
+ return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
if (unlikely(!fault->slot)) {
gva_t gva = fault->is_tdp ? 0 : fault->addr;
@@ -3422,7 +3456,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ sp = spte_to_child_sp(*root_hpa);
if (WARN_ON(!sp))
return;
@@ -3907,8 +3945,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->pae_root[i];
if (IS_VALID_PAE_ROOT(root)) {
- root &= SPTE_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
+ sp = spte_to_child_sp(root);
mmu_sync_children(vcpu, sp, true);
}
}
@@ -4169,7 +4206,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
&fault->hva);
if (!async)
@@ -4186,7 +4223,12 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
}
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
fault->write, &fault->map_writable,
&fault->hva);
return RET_PF_CONTINUE;
@@ -5971,7 +6013,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
r = kvm_mmu_init_tdp_mmu(kvm);
@@ -6656,7 +6698,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -6788,7 +6830,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
mutex_unlock(&kvm_lock);
}
@@ -6796,9 +6838,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
return err;
}
-static void kvm_recover_nx_lpages(struct kvm *kvm)
+static void kvm_recover_nx_huge_pages(struct kvm *kvm)
{
unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+ struct kvm_memory_slot *slot;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -6819,24 +6862,55 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
- if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ if (list_empty(&kvm->arch.possible_nx_huge_pages))
break;
/*
* We use a separate list instead of just using active_mmu_pages
- * because the number of lpage_disallowed pages is expected to
- * be relatively small compared to the total.
+ * because the number of shadow pages that be replaced with an
+ * NX huge page is expected to be relatively small compared to
+ * the total number of shadow pages. And because the TDP MMU
+ * doesn't use active_mmu_pages.
*/
- sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
struct kvm_mmu_page,
- lpage_disallowed_link);
- WARN_ON_ONCE(!sp->lpage_disallowed);
- if (is_tdp_mmu_page(sp)) {
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ /*
+ * Unaccount and do not attempt to recover any NX Huge Pages
+ * that are being dirty tracked, as they would just be faulted
+ * back in as 4KiB pages. The NX Huge Pages in this slot will be
+ * recovered, along with all the other huge pages in the slot,
+ * when dirty logging is disabled.
+ *
+ * Since gfn_to_memslot() is relatively expensive, it helps to
+ * skip it if it the test cannot possibly return true. On the
+ * other hand, if any memslot has logging enabled, chances are
+ * good that all of them do, in which case unaccount_nx_huge_page()
+ * is much cheaper than zapping the page.
+ *
+ * If a memslot update is in progress, reading an incorrect value
+ * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+ * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+ * it is becoming nonzero, the page will be zapped unnecessarily.
+ * Either way, this only affects efficiency in racy situations,
+ * and not correctness.
+ */
+ slot = NULL;
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+
+ if (slot && kvm_slot_dirty_track_enabled(slot))
+ unaccount_nx_huge_page(kvm, sp);
+ else if (is_tdp_mmu_page(sp))
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
- } else {
+ else
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- WARN_ON_ONCE(sp->lpage_disallowed);
- }
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
@@ -6856,7 +6930,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_lpage_recovery_timeout(u64 start_time)
+static long get_nx_huge_page_recovery_timeout(u64 start_time)
{
bool enabled;
uint period;
@@ -6867,19 +6941,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time)
: MAX_SCHEDULE_TIMEOUT;
}
-static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
{
u64 start_time;
long remaining_time;
while (true) {
start_time = get_jiffies_64();
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && remaining_time > 0) {
schedule_timeout(remaining_time);
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -6888,7 +6962,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
if (kthread_should_stop())
return 0;
- kvm_recover_nx_lpages(kvm);
+ kvm_recover_nx_huge_pages(kvm);
}
}
@@ -6896,17 +6970,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
{
int err;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
"kvm-nx-lpage-recovery",
- &kvm->arch.nx_lpage_recovery_thread);
+ &kvm->arch.nx_huge_page_recovery_thread);
if (!err)
- kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+ kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
return err;
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
- if (kvm->arch.nx_lpage_recovery_thread)
- kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 582def531d4d..dbaf6755c5a7 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -57,7 +57,13 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
/*
* The following two entries are used to key the shadow page in the
@@ -100,7 +106,14 @@ struct kvm_mmu_page {
};
};
- struct list_head lpage_disallowed_link;
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -120,18 +133,6 @@ struct kvm_mmu_page {
extern struct kmem_cache *mmu_page_header_cache;
-static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
-{
- return to_shadow_page(__pa(sptep));
-}
-
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
return role.smm ? 1 : 0;
@@ -315,7 +316,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5ab5f94dcb6f..0f6455072055 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -713,9 +713,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 2e08b2a45361..c0fd7e049b4e 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have an page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..1f03701b943a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* should not modify the SPTE.
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
- * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
* vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
@@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 672f0432d777..771210ce5181 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -29,7 +29,6 @@ int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
- INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
kvm->arch.tdp_mmu_zap_wq = wq;
return 1;
}
@@ -54,7 +53,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
/* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
- WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -284,6 +283,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
gfn_t gfn, union kvm_mmu_page_role role)
{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
sp->role = role;
@@ -375,11 +376,13 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
}
/**
@@ -395,14 +398,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
- list_del(&sp->link);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp);
if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
@@ -1116,16 +1122,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @sp: The new TDP page table to install.
- * @account_nx: True if this page table is being installed to split a
- * non-executable huge page.
* @shared: This operation is running under the MMU lock in read mode.
*
* Returns: 0 if the new page table was installed. Non-0 if the page table
* could not be installed (e.g. the atomic compare-exchange failed).
*/
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool account_nx,
- bool shared)
+ struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
int ret = 0;
@@ -1138,16 +1141,14 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
tdp_mmu_set_spte(kvm, iter, spte);
}
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
tdp_account_mmu_page(kvm, sp);
return 0;
}
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
@@ -1155,9 +1156,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- int ret;
+ int ret = RET_PF_RETRY;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -1166,6 +1168,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ int r;
+
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
@@ -1173,57 +1177,52 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
break;
/*
- * If there is an SPTE mapping a large page at a higher level
- * than the target, that SPTE must be cleared and replaced
- * with a non-leaf SPTE.
+ * If SPTE has been frozen by another thread, just give up and
+ * retry, avoiding unnecessary page table allocation and free.
*/
- if (is_shadow_present_pte(iter.old_spte) &&
- is_large_pte(iter.old_spte)) {
- if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
- break;
+ if (is_removed_spte(iter.old_spte))
+ goto retry;
- /*
- * The iter must explicitly re-read the spte here
- * because the new value informs the !present
- * path below.
- */
- iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
- }
+ /* Step down into the lower level page table if it exists. */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ !is_large_pte(iter.old_spte))
+ continue;
- if (!is_shadow_present_pte(iter.old_spte)) {
- bool account_nx = fault->huge_page_disallowed &&
- fault->req_level >= iter.level;
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- /*
- * If SPTE has been frozen by another thread, just
- * give up and retry, avoiding unnecessary page table
- * allocation and free.
- */
- if (is_removed_spte(iter.old_spte))
- break;
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- sp = tdp_mmu_alloc_sp(vcpu);
- tdp_mmu_init_child_sp(sp, &iter);
+ if (is_shadow_present_pte(iter.old_spte))
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ else
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
- if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
- tdp_mmu_free_sp(sp);
- break;
- }
+ /*
+ * Also force the guest to retry the access if the upper level SPTEs
+ * aren't in place.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
}
- }
- /*
- * Force the guest to retry the access if the upper level SPTEs aren't
- * in place, or if the target leaf SPTE is frozen by another CPU.
- */
- if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
- rcu_read_unlock();
- return RET_PF_RETRY;
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ track_possible_nx_huge_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ }
}
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
- rcu_read_unlock();
+retry:
+ rcu_read_unlock();
return ret;
}
@@ -1472,6 +1471,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
return sp;
}
+/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
@@ -1479,8 +1479,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
const int level = iter->level;
int ret, i;
- tdp_mmu_init_child_sp(sp, iter);
-
/*
* No need for atomics when writing to sp->spt since the page table has
* not been linked in yet and thus is not reachable from any other CPU.
@@ -1496,7 +1494,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* correctness standpoint since the translation will be the same either
* way.
*/
- ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
if (ret)
goto out;
@@ -1556,6 +1554,8 @@ retry:
continue;
}
+ tdp_mmu_init_child_sp(sp, &iter);
+
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
goto retry;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index c163f7cc23ca..d3714200b932 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,6 +5,8 @@
#include <linux/kvm_host.h>
+#include "spte.h"
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index d9b9a0f0db17..684393c22105 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -56,7 +56,7 @@ static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
* code. Each pmc, stored in kvm_pmc.idx field, is unique across
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
- * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
* [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
@@ -101,10 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
bool skip_pmi = false;
- /* Ignore counters that have been reprogrammed already. */
- if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
- return;
-
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
if (!in_pmi) {
/*
@@ -122,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
} else {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
}
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
if (!pmc->intr || skip_pmi)
return;
@@ -147,12 +142,22 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ /*
+ * Ignore overflow events for counters that are scheduled to be
+ * reprogrammed, e.g. if a PMI for the previous event races with KVM's
+ * handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
__kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
-static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- u64 config, bool exclude_user,
- bool exclude_kernel, bool intr)
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
struct perf_event *event;
@@ -204,14 +209,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
PTR_ERR(event), pmc->idx);
- return;
+ return PTR_ERR(event);
}
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
- clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
pmc->intr = intr || pebs;
+ return 0;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -245,7 +250,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
perf_event_enable(pmc->perf_event);
pmc->is_paused = false;
- clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
}
@@ -293,7 +297,7 @@ out:
return allow_event;
}
-void reprogram_counter(struct kvm_pmc *pmc)
+static void reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
@@ -303,10 +307,13 @@ void reprogram_counter(struct kvm_pmc *pmc)
pmc_pause_counter(pmc);
if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
- return;
+ goto reprogram_complete;
if (!check_pmu_event_filter(pmc))
- return;
+ goto reprogram_complete;
+
+ if (pmc->counter < pmc->prev_counter)
+ __kvm_perf_overflow(pmc, false);
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
printk_once("kvm pmu: pin control bit is ignored\n");
@@ -324,18 +331,29 @@ void reprogram_counter(struct kvm_pmc *pmc)
}
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
- return;
+ goto reprogram_complete;
pmc_release_perf_event(pmc);
pmc->current_config = new_config;
- pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
- (eventsel & pmu->raw_event_mask),
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
+
+ /*
+ * If reprogramming fails, e.g. due to contention, leave the counter's
+ * regprogram bit set, i.e. opportunistically try again on the next PMU
+ * refresh. Don't make a new request as doing so can stall the guest
+ * if reprogramming repeatedly fails.
+ */
+ if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT))
+ return;
+
+reprogram_complete:
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->prev_counter = 0;
}
-EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
@@ -345,10 +363,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
- if (unlikely(!pmc || !pmc->perf_event)) {
+ if (unlikely(!pmc)) {
clear_bit(bit, pmu->reprogram_pmi);
continue;
}
+
reprogram_counter(pmc);
}
@@ -522,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
- u64 prev_count;
-
- prev_count = pmc->counter;
+ pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
-
- reprogram_counter(pmc);
- if (pmc->counter < prev_count)
- __kvm_perf_overflow(pmc, false);
+ kvm_pmu_request_counter_reprogam(pmc);
}
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
@@ -542,12 +556,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
- u64 config = pmc->current_config;
+ u64 config;
if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
select_os = config & ARCH_PERFMON_EVENTSEL_OS;
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
select_os = config & 0x1;
select_user = config & 0x2;
}
@@ -577,6 +594,8 @@ EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
size_t size;
int r;
@@ -613,9 +632,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
mutex_unlock(&kvm->lock);
- synchronize_srcu_expedited(&kvm->srcu);
r = 0;
cleanup:
kfree(filter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 5cc5721f260b..85ff3c0588ba 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -183,7 +183,11 @@ static inline void kvm_init_pmu_capability(void)
KVM_PMC_MAX_FIXED);
}
-void reprogram_counter(struct kvm_pmc *pmc);
+static inline void kvm_pmu_request_counter_reprogam(struct kvm_pmc *pmc)
+{
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index a19d473d0184..203fdad07bae 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -7,23 +7,41 @@
#include <asm/cpufeatures.h>
/*
- * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
- * be directly used by KVM. Note, these word values conflict with the kernel's
- * "bug" caps, but KVM doesn't use those.
+ * Hardware-defined CPUID leafs that are either scattered by the kernel or are
+ * unknown to the kernel, but need to be directly used by KVM. Note, these
+ * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
*/
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
+ CPUID_7_1_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
};
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -48,6 +66,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
[CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
};
/*
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000000..a9c1c2af8d94
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,649 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+ CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = (u32)val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = (u32)val;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = val;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+ smram->smm_revison = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+ /*
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
+ *
+ * Kill the VM in the unlikely case of failure, because the VM
+ * can be in undefined state in this case.
+ */
+ if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
+
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ kvm_set_msr(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ return rsm_load_state_32(ctxt, &smram.smram32);
+}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000000..a1cf2ac5bd78
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+ struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit*/
+ struct kvm_smm_seg_state_64 ldtr;
+ struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit*/
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+ /*
+ * Two fields below are implemented on AMD only, to store
+ * SVM guest vmcb address if the #SMI was received while in the guest mode.
+ */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+ u32 smm_revison;
+ u32 smbase;
+ u32 reserved4[5];
+
+ /* ssp and svm_* fields below are not implemented by KVM */
+ u64 ssp;
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX.) */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif
+
+#endif
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000000..088f6429b24c
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
index 7d6d97968fb9..02f4784b5d44 100644
--- a/arch/x86/kvm/svm/hyperv.h
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -9,27 +9,37 @@
#include <asm/mshyperv.h>
#include "../hyperv.h"
+#include "svm.h"
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4c620999d230..bc9cd7086fa9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -25,6 +25,7 @@
#include "trace.h"
#include "mmu.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
@@ -149,8 +150,12 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
- /* We don't want to see VMMCALLs from a nested guest */
- vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
@@ -179,8 +184,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
*/
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
int i;
/*
@@ -194,7 +198,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc &&
kvm_hv_hypercall_enabled(&svm->vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
- (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
@@ -369,8 +373,8 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
/* Hyper-V extensions (Enlightened VMCB) */
if (kvm_hv_hypercall_enabled(vcpu)) {
to->clean = from->clean;
- memcpy(to->reserved_sw, from->reserved_sw,
- sizeof(struct hv_enlightenments));
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
}
}
@@ -474,6 +478,15 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && npt_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* TODO: optimize unconditional TLB flush/MMU sync. A partial list of
* things to fix before this can be conditional:
*
@@ -800,6 +813,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
if (kvm_vcpu_apicv_active(vcpu))
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
return 0;
}
@@ -822,6 +837,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
return 1;
}
+ /* This fails when VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
vmcb12_gpa = svm->vmcb->save.rax;
ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
@@ -1091,6 +1113,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
}
@@ -1125,6 +1153,9 @@ void svm_free_nested(struct vcpu_svm *svm)
if (!svm->nested.initialized)
return;
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
@@ -1143,9 +1174,6 @@ void svm_free_nested(struct vcpu_svm *svm)
svm->nested.initialized = false;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void svm_leave_nested(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1377,6 +1405,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
if (block_nested_events)
return -EBUSY;
@@ -1385,6 +1414,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
+#endif
if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
if (block_nested_events)
@@ -1411,6 +1441,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
switch (exit_code) {
case SVM_EXIT_INTR:
@@ -1429,6 +1460,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
}
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
default:
break;
}
@@ -1479,7 +1517,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
- /* 'clean' and 'reserved_sw' are not changed by KVM */
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1709,6 +1747,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
return false;
}
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
return true;
}
@@ -1720,4 +1761,5 @@ struct kvm_x86_nested_ops svm_nested_ops = {
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index b68956299fa8..0e313fbae055 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -159,7 +159,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~pmu->reserved_bits;
if (data != pmc->eventsel) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
return 0;
}
@@ -192,9 +192,10 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
+ BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
- for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
+ for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -207,11 +208,11 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
+ for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) {
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 28064060413a..69dbf17f0d6a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -196,7 +196,7 @@ static void sev_asid_free(struct kvm_sev_info *sev)
__set_bit(sev->asid, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
- sd = per_cpu(svm_data, cpu);
+ sd = per_cpu_ptr(&svm_data, cpu);
sd->sev_vmcbs[sev->asid] = NULL;
}
@@ -605,7 +605,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->dr6 = svm->vcpu.arch.dr6;
pr_debug("Virtual Machine Save Area (VMSA):\n");
- print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+ print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
return 0;
}
@@ -2600,7 +2600,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm);
/* Assign the asid allocated with this SEV guest */
@@ -2648,7 +2648,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
ghcb_scratch_beg = control->ghcb_gpa +
offsetof(struct ghcb, shared_buffer);
ghcb_scratch_end = control->ghcb_gpa +
- offsetof(struct ghcb, reserved_1);
+ offsetof(struct ghcb, reserved_0xff0);
/*
* If the scratch area begins within the GHCB, it must be
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 58f0077d9357..91352d692845 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -6,6 +6,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "pmu.h"
@@ -245,7 +246,7 @@ struct kvm_ldttss_desc {
u32 zero1;
} __attribute__((packed));
-DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
@@ -346,12 +347,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return 0;
}
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -581,12 +576,7 @@ static int svm_hardware_enable(void)
pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
return -EINVAL;
}
- sd = per_cpu(svm_data, me);
- if (!sd) {
- pr_err("%s: svm_data is NULL on %d\n", __func__, me);
- return -EINVAL;
- }
-
+ sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
@@ -597,7 +587,7 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_EFER, efer | EFER_SVME);
- wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
+ wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
/*
@@ -646,42 +636,37 @@ static int svm_hardware_enable(void)
static void svm_cpu_uninit(int cpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
- if (!sd)
+ if (!sd->save_area)
return;
- per_cpu(svm_data, cpu) = NULL;
kfree(sd->sev_vmcbs);
__free_page(sd->save_area);
- kfree(sd);
+ sd->save_area_pa = 0;
+ sd->save_area = NULL;
}
static int svm_cpu_init(int cpu)
{
- struct svm_cpu_data *sd;
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
int ret = -ENOMEM;
- sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!sd)
- return ret;
- sd->cpu = cpu;
+ memset(sd, 0, sizeof(struct svm_cpu_data));
sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!sd->save_area)
- goto free_cpu_data;
+ return ret;
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
- per_cpu(svm_data, cpu) = sd;
-
+ sd->save_area_pa = __sme_page_pa(sd->save_area);
return 0;
free_save_area:
__free_page(sd->save_area);
-free_cpu_data:
- kfree(sd);
+ sd->save_area = NULL;
return ret;
}
@@ -730,6 +715,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
u32 offset;
u32 *msrpm;
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
to_svm(vcpu)->msrpm;
@@ -1425,7 +1419,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb)
int i;
for_each_online_cpu(i)
- cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
+ cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
@@ -1439,6 +1433,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);
+ svm_leave_nested(vcpu);
svm_free_nested(svm);
sev_free_vcpu(vcpu);
@@ -1450,7 +1445,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
if (sev_es_guest(vcpu->kvm))
sev_es_unmap_ghcb(svm);
@@ -1462,7 +1457,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* Save additional host state that will be restored on VMEXIT (sev-es)
* or subsequent vmload of host save area.
*/
- vmsave(__sme_page_pa(sd->save_area));
+ vmsave(sd->save_area_pa);
if (sev_es_guest(vcpu->kvm)) {
struct sev_es_save_area *hostsa;
hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
@@ -1487,7 +1482,7 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
@@ -2714,8 +2709,6 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
break;
- case MSR_IA32_PERF_CAPABILITIES:
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -3426,15 +3419,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
@@ -3443,7 +3427,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
static void reload_tss(struct kvm_vcpu *vcpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
sd->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
@@ -3451,7 +3435,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
static void pre_svm_run(struct kvm_vcpu *vcpu)
{
- struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
struct vcpu_svm *svm = to_svm(vcpu);
/*
@@ -3739,6 +3723,13 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
/*
+ * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+ * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+ * entries, and thus is a superset of Hyper-V's fine grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+
+ /*
* Flush only the current ASID even if the TLB flush was invoked via
* kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
* ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
@@ -3911,30 +3902,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
-static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned long vmcb_pa = svm->current_vmcb->pa;
guest_state_enter_irqoff();
- if (sev_es_guest(vcpu->kvm)) {
- __svm_sev_es_vcpu_run(vmcb_pa);
- } else {
- struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
-
- /*
- * Use a single vmcb (vmcb01 because it's always valid) for
- * context switching guest state via VMLOAD/VMSAVE, that way
- * the state doesn't need to be copied between vmcb01 and
- * vmcb02 when switching vmcbs for nested virtualization.
- */
- vmload(svm->vmcb01.pa);
- __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
- vmsave(svm->vmcb01.pa);
-
- vmload(__sme_page_pa(sd->save_area));
- }
+ if (sev_es_guest(vcpu->kvm))
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
+ else
+ __svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
@@ -3942,6 +3919,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
trace_kvm_entry(vcpu);
@@ -3998,34 +3976,15 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* being speculatively taken.
*/
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+ x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
- svm_vcpu_enter_exit(vcpu);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
- unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+ svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
if (!sev_es_guest(vcpu->kvm))
reload_tss(vcpu);
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+ x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
if (!sev_es_guest(vcpu->kvm)) {
vcpu->arch.cr2 = svm->vmcb->save.cr2;
@@ -4149,6 +4108,8 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return false;
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/* SEV-ES guests do not support SMM, so report false */
if (kvm && sev_es_guest(kvm))
return false;
@@ -4405,6 +4366,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+#ifdef CONFIG_KVM_SMM
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4432,7 +4394,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map_save;
@@ -4441,10 +4403,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
if (!is_guest_mode(vcpu))
return 0;
- /* FED8h - SVM Guest */
- put_smstate(u64, smstate, 0x7ed8, 1);
- /* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+ /*
+ * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is
+ * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+ */
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 1;
+
+ smram->smram64.svm_guest_flag = 1;
+ smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4466,8 +4434,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
- &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
return 1;
BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
@@ -4479,34 +4446,33 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map, map_save;
- u64 saved_efer, vmcb12_gpa;
struct vmcb *vmcb12;
int ret;
+ const struct kvm_smram_state_64 *smram64 = &smram->smram64;
+
if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
return 0;
/* Non-zero if SMI arrived while vCPU was in guest mode. */
- if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ if (!smram64->svm_guest_flag)
return 0;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
return 1;
- saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
- if (!(saved_efer & EFER_SVME))
+ if (!(smram64->efer & EFER_SVME))
return 1;
- vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
return 1;
ret = 1;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
goto unmap_map;
if (svm_allocate_nested(svm))
@@ -4528,7 +4494,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
if (ret)
goto unmap_save;
@@ -4554,6 +4520,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
/* We must be in SMM; RSM will cause a vmexit anyway. */
}
}
+#endif
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
@@ -4829,10 +4796,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = svm_smi_allowed,
.enter_smm = svm_enter_smm,
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
+#endif
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
@@ -4898,6 +4867,7 @@ static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
+ kvm_caps.supported_perf_cap = 0;
kvm_caps.supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6a7686bf6900..4826e6cc611b 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -151,7 +151,10 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
struct svm_nested_state {
@@ -209,7 +212,6 @@ struct vcpu_svm {
struct vmcb *vmcb;
struct kvm_vmcb_info vmcb01;
struct kvm_vmcb_info *current_vmcb;
- struct svm_cpu_data *svm_data;
u32 asid;
u32 sysenter_esp_hi;
u32 sysenter_eip_hi;
@@ -281,8 +283,6 @@ struct vcpu_svm {
};
struct svm_cpu_data {
- int cpu;
-
u64 asid_generation;
u32 max_asid;
u32 next_asid;
@@ -290,13 +290,15 @@ struct svm_cpu_data {
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
+ unsigned long save_area_pa;
+
struct vmcb *current_vmcb;
/* index = sev_asid, value = vmcb pointer */
struct vmcb **sev_vmcbs;
};
-DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
+DECLARE_PER_CPU(struct svm_cpu_data, svm_data);
void recalc_intercepts(struct vcpu_svm *svm);
@@ -683,7 +685,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(unsigned long vmcb_pa);
-void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#endif
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 8cdc62c74a96..26a89d0da93e 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -14,9 +14,9 @@
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve;
+ struct hv_vmcb_enlightenments *hve;
struct hv_partition_assist_pg **p_hv_pa_pg =
&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
@@ -26,13 +26,13 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
if (!*p_hv_pa_pg)
return -ENOMEM;
- hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw;
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
hve->partition_assist_page = __pa(*p_hv_pa_pg);
hve->hv_vm_id = (unsigned long)vcpu->kvm;
if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
hve->hv_enlightenments_control.nested_flush_hypercall = 1;
- vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
return 0;
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index e2fc59380465..45faf84476ce 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -13,12 +13,14 @@
static struct kvm_x86_ops svm_x86_ops;
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
@@ -51,8 +53,8 @@ static inline void svm_hv_hardware_setup(void)
vp_ap->nested_control.features.directhypercall = 1;
}
- svm_x86_ops.enable_direct_tlbflush =
- svm_hv_enable_direct_tlbflush;
+ svm_x86_ops.enable_l2_tlb_flush =
+ svm_hv_enable_l2_tlb_flush;
}
}
@@ -60,23 +62,20 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct kvm_vcpu *vcpu)
{
struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
if (hve->hv_enlightenments_control.msr_bitmap)
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
-static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
- struct kvm_vcpu *vcpu)
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
u32 vp_index = kvm_hv_get_vpindex(vcpu);
if (hve->hv_vp_id != vp_index) {
hve->hv_vp_id = vp_index;
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
}
#else
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 9430d6437c9f..36c8af87a707 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -61,9 +61,4 @@ static __always_inline void vmsave(unsigned long pa)
svm_asm1(vmsave, "a" (pa), "memory");
}
-static __always_inline void vmload(unsigned long pa)
-{
- svm_asm1(vmload, "a" (pa), "memory");
-}
-
#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 723f8534986c..34367dc203f2 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -4,35 +4,97 @@
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
+#include "kvm-asm-offsets.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
/* Intentionally omit RAX as it's context switched by hardware */
-#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
-#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
-#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE)
/* Intentionally omit RSP as it's context switched by hardware */
-#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
-#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
-#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE)
#ifdef CONFIG_X86_64
-#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
-#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
-#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
-#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
-#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
-#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
-#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
-#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE)
#endif
+#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa)
+
.section .noinstr.text, "ax"
+.macro RESTORE_GUEST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+801:
+.endm
+.macro RESTORE_GUEST_SPEC_CTRL_BODY
+800:
+ /*
+ * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the
+ * host's, write the MSR. This is kept out-of-line so that the common
+ * case does not have to jump.
+ *
+ * IMPORTANT: To avoid RSB underflow attacks and any other nastiness,
+ * there must not be any returns or indirect branches between this code
+ * and vmentry.
+ */
+ movl SVM_spec_ctrl(%_ASM_DI), %eax
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ je 801b
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+ xor %edx, %edx
+ wrmsr
+ jmp 801b
+.endm
+
+.macro RESTORE_HOST_SPEC_CTRL
+ /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */
+ ALTERNATIVE_2 "", \
+ "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \
+ "", X86_FEATURE_V_SPEC_CTRL
+901:
+.endm
+.macro RESTORE_HOST_SPEC_CTRL_BODY
+900:
+ /* Same for after vmexit. */
+ mov $MSR_IA32_SPEC_CTRL, %ecx
+
+ /*
+ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
+ * if it was not intercepted during guest execution.
+ */
+ cmpb $0, (%_ASM_SP)
+ jnz 998f
+ rdmsr
+ movl %eax, SVM_spec_ctrl(%_ASM_DI)
+998:
+
+ /* Now restore the host value of the MSR if different from the guest's. */
+ movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ cmp SVM_spec_ctrl(%_ASM_DI), %eax
+ je 901b
+ xor %edx, %edx
+ wrmsr
+ jmp 901b
+.endm
+
+
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
- * @regs: unsigned long * (to guest registers)
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
@@ -47,49 +109,71 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
push %_ASM_BX
- /* Save @regs. */
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
push %_ASM_ARG2
- /* Save @vmcb. */
+ /* Needed to restore access to percpu variables. */
+ __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa)
+
+ /* Finally save @svm. */
push %_ASM_ARG1
- /* Move @regs to RAX. */
- mov %_ASM_ARG2, %_ASM_AX
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /*
+ * Use a single vmcb (vmcb01 because it's always valid) for
+ * context switching guest state via VMLOAD/VMSAVE, that way
+ * the state doesn't need to be copied between vmcb01 and
+ * vmcb02 when switching vmcbs for nested virtualization.
+ */
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+1: vmload %_ASM_AX
+2:
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Load guest registers. */
- mov VCPU_RCX(%_ASM_AX), %_ASM_CX
- mov VCPU_RDX(%_ASM_AX), %_ASM_DX
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
- mov VCPU_RSI(%_ASM_AX), %_ASM_SI
- mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RCX(%_ASM_DI), %_ASM_CX
+ mov VCPU_RDX(%_ASM_DI), %_ASM_DX
+ mov VCPU_RBX(%_ASM_DI), %_ASM_BX
+ mov VCPU_RBP(%_ASM_DI), %_ASM_BP
+ mov VCPU_RSI(%_ASM_DI), %_ASM_SI
#ifdef CONFIG_X86_64
- mov VCPU_R8 (%_ASM_AX), %r8
- mov VCPU_R9 (%_ASM_AX), %r9
- mov VCPU_R10(%_ASM_AX), %r10
- mov VCPU_R11(%_ASM_AX), %r11
- mov VCPU_R12(%_ASM_AX), %r12
- mov VCPU_R13(%_ASM_AX), %r13
- mov VCPU_R14(%_ASM_AX), %r14
- mov VCPU_R15(%_ASM_AX), %r15
+ mov VCPU_R8 (%_ASM_DI), %r8
+ mov VCPU_R9 (%_ASM_DI), %r9
+ mov VCPU_R10(%_ASM_DI), %r10
+ mov VCPU_R11(%_ASM_DI), %r11
+ mov VCPU_R12(%_ASM_DI), %r12
+ mov VCPU_R13(%_ASM_DI), %r13
+ mov VCPU_R14(%_ASM_DI), %r14
+ mov VCPU_R15(%_ASM_DI), %r15
#endif
-
- /* "POP" @vmcb to RAX. */
- pop %_ASM_AX
+ mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Enter guest mode */
sti
-1: vmrun %_ASM_AX
-
-2: cli
-
-#ifdef CONFIG_RETPOLINE
- /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+3: vmrun %_ASM_AX
+4:
+ cli
- /* "POP" @regs to RAX. */
+ /* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX
/* Save all guest registers. */
@@ -110,6 +194,26 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %r15, VCPU_R15(%_ASM_AX)
#endif
+ /* @svm can stay in RDI from now on. */
+ mov %_ASM_AX, %_ASM_DI
+
+ mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX
+5: vmsave %_ASM_AX
+6:
+
+ /* Restores GSBASE among other things, allowing access to percpu data. */
+ pop %_ASM_AX
+7: vmload %_ASM_AX
+8:
+
+#ifdef CONFIG_RETPOLINE
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+#endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -145,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run)
xor %r15d, %r15d
#endif
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -159,17 +266,33 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %_ASM_BP
RET
-3: cmpb $0, kvm_rebooting
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
+10: cmpb $0, kvm_rebooting
jne 2b
ud2
+30: cmpb $0, kvm_rebooting
+ jne 4b
+ ud2
+50: cmpb $0, kvm_rebooting
+ jne 6b
+ ud2
+70: cmpb $0, kvm_rebooting
+ jne 8b
+ ud2
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(1b, 10b)
+ _ASM_EXTABLE(3b, 30b)
+ _ASM_EXTABLE(5b, 50b)
+ _ASM_EXTABLE(7b, 70b)
SYM_FUNC_END(__svm_vcpu_run)
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
- * @vmcb_pa: unsigned long
+ * @svm: struct vcpu_svm *
+ * @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
push %_ASM_BP
@@ -184,8 +307,31 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
#endif
push %_ASM_BX
- /* Move @vmcb to RAX. */
- mov %_ASM_ARG1, %_ASM_AX
+ /*
+ * Save variables needed after vmexit on the stack, in inverse
+ * order compared to when they are needed.
+ */
+
+ /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
+ push %_ASM_ARG2
+
+ /* Save @svm. */
+ push %_ASM_ARG1
+
+.ifnc _ASM_ARG1, _ASM_DI
+ /*
+ * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
+ * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ */
+ mov %_ASM_ARG1, %_ASM_DI
+.endif
+
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_GUEST_SPEC_CTRL
+
+ /* Get svm->current_vmcb->pa into RAX. */
+ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
+ mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
sti
@@ -194,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
+ /* Pop @svm to RDI, guest registers have been saved already. */
+ pop %_ASM_DI
+
#ifdef CONFIG_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
+ /* Clobbers RAX, RCX, RDX. */
+ RESTORE_HOST_SPEC_CTRL
+
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
* untrained as soon as we exit the VM and are back to the
@@ -208,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET
+ /* "Pop" @spec_ctrl_intercepted. */
+ pop %_ASM_BX
+
pop %_ASM_BX
#ifdef CONFIG_X86_64
@@ -222,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %_ASM_BP
RET
+ RESTORE_GUEST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY
+
3: cmpb $0, kvm_rebooting
jne 2b
ud2
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bc25589ad588..83843379813e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -113,12 +113,13 @@ TRACE_EVENT(kvm_hv_hypercall_done,
* Tracepoint for Xen hypercall.
*/
TRACE_EVENT(kvm_xen_hypercall,
- TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3, unsigned long a4,
- unsigned long a5),
- TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+ TP_PROTO(u8 cpl, unsigned long nr,
+ unsigned long a0, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4, unsigned long a5),
+ TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5),
TP_STRUCT__entry(
+ __field(u8, cpl)
__field(unsigned long, nr)
__field(unsigned long, a0)
__field(unsigned long, a1)
@@ -129,6 +130,7 @@ TRACE_EVENT(kvm_xen_hypercall,
),
TP_fast_assign(
+ __entry->cpl = cpl;
__entry->nr = nr;
__entry->a0 = a0;
__entry->a1 = a1;
@@ -138,8 +140,9 @@ TRACE_EVENT(kvm_xen_hypercall,
__entry->a4 = a5;
),
- TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
- __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->cpl, __entry->nr,
+ __entry->a0, __entry->a1, __entry->a2,
__entry->a3, __entry->a4, __entry->a5)
);
@@ -1547,38 +1550,41 @@ TRACE_EVENT(kvm_hv_timer_state,
* Tracepoint for kvm_hv_flush_tlb.
*/
TRACE_EVENT(kvm_hv_flush_tlb,
- TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
- TP_ARGS(processor_mask, address_space, flags),
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(processor_mask, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, processor_mask)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
__entry->processor_mask = processor_mask;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
- TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
__entry->processor_mask, __entry->address_space,
- __entry->flags)
+ __entry->flags, __entry->guest_mode ? "(L2)" : "")
);
/*
* Tracepoint for kvm_hv_flush_tlb_ex.
*/
TRACE_EVENT(kvm_hv_flush_tlb_ex,
- TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
- TP_ARGS(valid_bank_mask, format, address_space, flags),
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, valid_bank_mask)
__field(u64, format)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
@@ -1586,12 +1592,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
__entry->format = format;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
TP_printk("valid_bank_mask 0x%llx format 0x%llx "
- "address_space 0x%llx flags 0x%llx",
+ "address_space 0x%llx flags 0x%llx %s",
__entry->valid_bank_mask, __entry->format,
- __entry->address_space, __entry->flags)
+ __entry->address_space, __entry->flags,
+ __entry->guest_mode ? "(L2)" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 07254314f3dd..cd2ac9536c99 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void)
return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}
-static inline u64 vmx_get_perf_capabilities(void)
-{
- u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
- u64 host_perf_cap = 0;
-
- if (!enable_pmu)
- return 0;
-
- if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
-
- if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
-
- if (vmx_pebs_supported()) {
- perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
- }
-
- return perf_cap;
-}
-
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c
index d8b23c96d627..ae03d1fe0355 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -3,9 +3,9 @@
#include <linux/errno.h>
#include <linux/smp.h>
-#include "../hyperv.h"
#include "../cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "nested.h"
#include "vmcs.h"
#include "vmx.h"
#include "trace.h"
@@ -322,24 +322,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
{
- struct hv_vp_assist_page assist_page;
-
- *evmcs_gpa = -1ull;
-
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
- return false;
-
- if (unlikely(!assist_page.enlighten_vmentry))
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
- return false;
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
- *evmcs_gpa = assist_page.current_nested_vmcs;
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
- return true;
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
}
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
@@ -507,3 +500,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return 0;
}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h
index 6f746ef3c038..571e7929d14e 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_EVMCS_H
-#define __KVM_X86_VMX_EVMCS_H
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
#include <linux/jump_label.h>
@@ -8,6 +8,8 @@
#include <asm/mshyperv.h>
#include <asm/vmx.h>
+#include "../hyperv.h"
+
#include "capabilities.h"
#include "vmcs.h"
#include "vmcs12.h"
@@ -235,11 +237,13 @@ enum nested_evmptrld_status {
EVMPTRLD_ERROR,
};
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
-#endif /* __KVM_X86_VMX_EVMCS_H */
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c62352dda6a..b28be793de29 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,7 +7,6 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -16,6 +15,7 @@
#include "trace.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
}
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -1126,6 +1133,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && enable_ept)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
* for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
* full TLB flush from the guest's perspective. This is required even
@@ -1557,12 +1573,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
}
@@ -3251,6 +3276,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+ /*
+ * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+ * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+ * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+ * migration.
+ */
if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
@@ -4854,6 +4885,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}
@@ -5205,7 +5237,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
- u64 evmcs_gpa;
int r;
if (!nested_vmx_check_permission(vcpu))
@@ -5231,7 +5262,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
if (likely(!guest_cpuid_has_evmcs(vcpu) ||
- !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
@@ -6128,6 +6159,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
* Handle L2's bus locks in L0 directly.
*/
return true;
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
default:
break;
}
@@ -6440,9 +6476,6 @@ out:
return kvm_state.size;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -6982,4 +7015,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 25b70a85bef5..e5cec07ca8d9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -52,7 +52,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -76,7 +76,7 @@ static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
pmc = intel_pmc_idx_to_pmc(pmu, bit);
if (pmc)
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
}
}
@@ -477,7 +477,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
if (!(data & reserved_bits)) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogam(pmc);
return 0;
}
} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
@@ -617,7 +617,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -631,7 +631,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].current_config = 0;
}
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
@@ -643,18 +642,18 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmc *pmc = NULL;
int i;
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = 0;
+ pmc->counter = pmc->prev_counter = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 746129ddd5ae..01936013428b 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -208,9 +208,8 @@ struct __packed vmcs12 {
/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
-#define CHECK_OFFSET(field, loc) \
- BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
- "Offset of " #field " in struct vmcs12 has changed.")
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
static inline void vmx_check_vmcs12_offsets(void)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 8477d8bdd69c..0b5db4de4d09 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,12 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
-#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include <asm/percpu.h>
#include <asm/segment.h>
+#include "kvm-asm-offsets.h"
#include "run_flags.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..cea8c07f5229 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -51,7 +51,6 @@
#include "capabilities.h"
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
@@ -66,6 +65,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -526,7 +526,7 @@ static unsigned long host_idt_base;
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
@@ -1849,9 +1849,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
- case MSR_IA32_PERF_CAPABILITIES:
- msr->data = vmx_get_perf_capabilities();
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -2029,7 +2026,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+ if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
@@ -2342,14 +2339,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
- (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
}
if (data & PERF_CAP_PEBS_FORMAT) {
if ((data & PERF_CAP_PEBS_MASK) !=
- (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
return 1;
@@ -6844,6 +6841,8 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
@@ -7669,6 +7668,31 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
+static u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
static __init void vmx_set_cpu_caps(void)
{
kvm_set_cpu_caps();
@@ -7691,6 +7715,7 @@ static __init void vmx_set_cpu_caps(void)
if (!enable_pmu)
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7906,6 +7931,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEAT_CTL_LMCE_ENABLED;
}
+#ifdef CONFIG_KVM_SMM
static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
@@ -7914,7 +7940,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7935,7 +7961,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7960,6 +7986,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
+#endif
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
@@ -8127,10 +8154,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = vmx_smi_allowed,
.enter_smm = vmx_enter_smm,
.leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
+#endif
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -8490,8 +8519,8 @@ static int __init vmx_init(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
} else {
enlightened_vmcs = false;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ec268df83ed6..f6f23c7397dc 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,7 +6,7 @@
#include <asm/vmx.h>
-#include "evmcs.h"
+#include "hyperv.h"
#include "vmcs.h"
#include "../x86.h"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a228a1b872bb..2bc3eac3f14e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -30,6 +30,7 @@
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"
+#include "smm.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -119,8 +120,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
-static void process_smi(struct kvm_vcpu *vcpu);
-static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
@@ -628,6 +627,12 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -1438,32 +1443,27 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
- MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
- MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
- MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
- MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
- MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
- MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
- MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
- MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
- MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
@@ -1653,6 +1653,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
case MSR_IA32_ARCH_CAPABILITIES:
msr->data = kvm_get_arch_capabilities();
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = kvm_caps.supported_perf_cap;
+ break;
case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
@@ -3396,6 +3399,9 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
static_call(kvm_x86_flush_tlb_all)(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3414,6 +3420,12 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
}
static_call(kvm_x86_flush_tlb_guest)(vcpu);
+
+ /*
+ * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+ * grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
}
@@ -3565,20 +3577,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.arch_capabilities = data;
break;
- case MSR_IA32_PERF_CAPABILITIES: {
- struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
-
+ case MSR_IA32_PERF_CAPABILITIES:
if (!msr_info->host_initiated)
return 1;
- if (kvm_get_msr_feature(&msr_ent))
- return 1;
- if (data & ~msr_ent.data)
+ if (data & ~kvm_caps.supported_perf_cap)
return 1;
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
return 0;
- }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -3650,7 +3657,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
}
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
@@ -4066,7 +4073,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
msr_info->data = vcpu->arch.smbase;
break;
@@ -4440,6 +4447,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
case KVM_CAP_X86_SMM:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ break;
+
/* SMBASE is usually relocated above 1M on modern chipsets,
* and SMM handlers might indeed rely on 4G segment limits,
* so do not report SMM to be available if real mode is
@@ -4480,7 +4490,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- r = kvm_x86_ops.enable_direct_tlbflush != NULL;
+ r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
@@ -4896,13 +4906,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_SMI, vcpu);
-
- return 0;
-}
-
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -5038,8 +5041,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
process_nmi(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
/*
* KVM's ABI only allows for one exception to be migrated. Luckily,
@@ -5067,16 +5072,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
ex->pending && ex->has_payload)
kvm_deliver_exception_payload(vcpu, ex);
+ memset(events, 0, sizeof(*events));
+
/*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
* again.
*/
- if (kvm_exception_is_soft(ex->vector)) {
- events->exception.injected = 0;
- events->exception.pending = 0;
- } else {
+ if (!kvm_exception_is_soft(ex->vector)) {
events->exception.injected = ex->injected;
events->exception.pending = ex->pending;
/*
@@ -5096,20 +5100,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
- events->nmi.pad = 0;
- events->sipi_vector = 0; /* never valid when reporting to user space */
+ /* events->sipi_vector is never valid when reporting to user space */
+#ifdef CONFIG_KVM_SMM
events->smi.smm = is_smm(vcpu);
events->smi.pending = vcpu->arch.smi_pending;
events->smi.smm_inside_nmi =
!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+#endif
events->smi.latched_init = kvm_lapic_latched_init(vcpu);
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
@@ -5121,12 +5125,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
}
-
- memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
-
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
@@ -5199,8 +5199,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+#ifdef CONFIG_KVM_SMM
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
+ kvm_leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
}
@@ -5213,6 +5214,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
}
+#else
+ if (events->smi.smm || events->smi.pending ||
+ events->smi.smm_inside_nmi)
+ return -EINVAL;
+#endif
+
if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
@@ -5496,10 +5503,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
}
return r;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- if (!kvm_x86_ops.enable_direct_tlbflush)
+ if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5579,7 +5586,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SMI: {
- r = kvm_vcpu_ioctl_smi(vcpu);
+ r = kvm_inject_smi(vcpu);
break;
}
case KVM_SET_CPUID: {
@@ -6238,9 +6245,7 @@ split_irqchip_unlock:
break;
case KVM_CAP_X86_USER_SPACE_MSR:
r = -EINVAL;
- if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
- KVM_MSR_EXIT_REASON_UNKNOWN |
- KVM_MSR_EXIT_REASON_FILTER))
+ if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
break;
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
@@ -6417,7 +6422,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (!user_range->nmsrs)
return 0;
- if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
return -EINVAL;
if (!user_range->flags)
@@ -6451,7 +6456,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
int r = 0;
u32 i;
- if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+ if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
@@ -7041,14 +7046,14 @@ static void kvm_init_msr_list(void)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
- min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
+ min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
- min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
+ min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
continue;
break;
case MSR_IA32_XFD:
@@ -7124,8 +7129,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return handled;
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
+void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
{
static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
@@ -7161,16 +7166,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- access |= PFERR_FETCH_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
-}
-
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
@@ -7283,15 +7278,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
}
-static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
- unsigned long addr, void *val, unsigned int bytes)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
-
- return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
-}
-
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
@@ -8083,26 +8069,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
-{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
-}
-
-static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- return vcpu->arch.smbase;
-}
-
-static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- vcpu->arch.smbase = smbase;
-}
-
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
@@ -8177,18 +8143,13 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
return emul_to_vcpu(ctxt)->arch.hflags;
}
-static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- kvm_smm_changed(vcpu, false);
-}
-
-static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
+#ifndef CONFIG_KVM_SMM
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
{
- return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ WARN_ON_ONCE(1);
+ return X86EMUL_UNHANDLEABLE;
}
+#endif
static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
@@ -8214,7 +8175,6 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = emulator_read_std,
.write_std = emulator_write_std,
- .read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -8234,11 +8194,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
- .get_smbase = emulator_get_smbase,
- .set_smbase = emulator_set_smbase,
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
- .set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
@@ -8253,7 +8210,6 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
- .exiting_smm = emulator_exiting_smm,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
@@ -8326,8 +8282,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
- BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
- BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
ctxt->interruptibility = 0;
ctxt->have_exception = false;
@@ -8586,29 +8540,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
-{
- trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
-
- if (entering_smm) {
- vcpu->arch.hflags |= HF_SMM_MASK;
- } else {
- vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
-
- /* Process a latched INIT or SMI, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- /*
- * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
- * on SMM exit we still need to reload them from
- * guest memory
- */
- vcpu->arch.pdptrs_from_userspace = false;
- }
-
- kvm_mmu_reset_context(vcpu);
-}
-
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -9810,7 +9741,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
kvm_x86_ops.nested_ops->triple_fault(vcpu);
return 1;
}
@@ -9998,6 +9929,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
* in order to make progress and get back here for another iteration.
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
@@ -10010,6 +9942,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
} else
static_call(kvm_x86_enable_smi_window)(vcpu);
}
+#endif
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
@@ -10085,246 +10018,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
-{
- u32 flags = 0;
- flags |= seg->g << 23;
- flags |= seg->db << 22;
- flags |= seg->l << 21;
- flags |= seg->avl << 20;
- flags |= seg->present << 15;
- flags |= seg->dpl << 13;
- flags |= seg->s << 12;
- flags |= seg->type << 8;
- return flags;
-}
-
-static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
-
- kvm_get_segment(vcpu, &seg, n);
- put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
- else
- offset = 0x7f2c + (n - 3) * 12;
-
- put_smstate(u32, buf, offset + 8, seg.base);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
- u16 flags;
-
- kvm_get_segment(vcpu, &seg, n);
- offset = 0x7e00 + n * 16;
-
- flags = enter_smm_get_segment_flags(&seg) >> 8;
- put_smstate(u16, buf, offset, seg.selector);
- put_smstate(u16, buf, offset + 2, flags);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u64, buf, offset + 8, seg.base);
-}
-#endif
-
-static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
- put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
- put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
- put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
-
- for (i = 0; i < 8; i++)
- put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u32, buf, 0x7fcc, (u32)val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u32, buf, 0x7fc8, (u32)val);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u32, buf, 0x7fc4, seg.selector);
- put_smstate(u32, buf, 0x7f64, seg.base);
- put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u32, buf, 0x7fc0, seg.selector);
- put_smstate(u32, buf, 0x7f80, seg.base);
- put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f74, dt.address);
- put_smstate(u32, buf, 0x7f70, dt.size);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f58, dt.address);
- put_smstate(u32, buf, 0x7f54, dt.size);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_32(vcpu, buf, i);
-
- put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020000);
- put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- for (i = 0; i < 16; i++)
- put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
-
- put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
- put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u64, buf, 0x7f68, val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u64, buf, 0x7f60, val);
-
- put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
- put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
- put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
-
- put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020064);
-
- put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e94, seg.limit);
- put_smstate(u64, buf, 0x7e98, seg.base);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e84, dt.size);
- put_smstate(u64, buf, 0x7e88, dt.address);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e74, seg.limit);
- put_smstate(u64, buf, 0x7e78, seg.base);
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e64, dt.size);
- put_smstate(u64, buf, 0x7e68, dt.address);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_64(vcpu, buf, i);
-}
-#endif
-
-static void enter_smm(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs, ds;
- struct desc_ptr dt;
- unsigned long cr0;
- char buf[512];
-
- memset(buf, 0, 512);
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- enter_smm_save_state_64(vcpu, buf);
- else
-#endif
- enter_smm_save_state_32(vcpu, buf);
-
- /*
- * Give enter_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. leave guest mode) after we've saved the state into the
- * SMM state-save area.
- */
- static_call(kvm_x86_enter_smm)(vcpu, buf);
-
- kvm_smm_changed(vcpu, true);
- kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
-
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0x8000);
-
- cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
-
- static_call(kvm_x86_set_cr4)(vcpu, 0);
-
- /* Undocumented: IDT limit is set to zero on entry to SMM. */
- dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
-
- kvm_set_dr(vcpu, 7, DR7_FIXED_1);
-
- cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
- cs.base = vcpu->arch.smbase;
-
- ds.selector = 0;
- ds.base = 0;
-
- cs.limit = ds.limit = 0xffffffff;
- cs.type = ds.type = 0x3;
- cs.dpl = ds.dpl = 0;
- cs.db = ds.db = 0;
- cs.s = ds.s = 1;
- cs.l = ds.l = 0;
- cs.g = ds.g = 1;
- cs.avl = ds.avl = 0;
- cs.present = ds.present = 1;
- cs.unusable = ds.unusable = 0;
- cs.padding = ds.padding = 0;
-
- kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
-
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- static_call(kvm_x86_set_efer)(vcpu, 0);
-#endif
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_mmu_reset_context(vcpu);
-}
-
-static void process_smi(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.smi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
@@ -10549,28 +10242,42 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
kvm_mmu_load_pgd(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+
+ /*
+ * Note, the order matters here, as flushing "all" TLB entries
+ * also flushes the "current" TLB entries, i.e. servicing the
+ * flush "all" will clear any request to flush "current".
+ */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb_all(vcpu);
- /* Flushing all ASIDs flushes the current ASID... */
- kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
kvm_service_local_tlb_flush_requests(vcpu);
+ /*
+ * Fall back to a "full" guest flush if Hyper-V's precise
+ * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
+ * the flushes are considered "remote" and not "local" because
+ * the requests can be initiated from other vCPUs.
+ */
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+ kvm_hv_vcpu_flush_tlb(vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- if (is_guest_mode(vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (is_guest_mode(vcpu))
kvm_x86_ops.nested_ops->triple_fault(vcpu);
- } else {
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
vcpu->mmio_needed = 0;
r = 0;
- goto out;
}
+ goto out;
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
@@ -10580,8 +10287,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -11895,6 +11604,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
kvm_async_pf_hash_reset(vcpu);
+
+ vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
@@ -11999,8 +11710,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
WARN_ON_ONCE(!init_event &&
(old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+ /*
+ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
+ * possible to INIT the vCPU while L2 is active. Force the vCPU back
+ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
+ * bits), i.e. virtualization is disabled.
+ */
+ if (is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+
kvm_lapic_reset(vcpu, init_event);
+ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
vcpu->arch.hflags = 0;
vcpu->arch.smi_pending = 0;
@@ -12894,10 +12615,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
+#ifdef CONFIG_KVM_SMM
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
+#endif
if (kvm_arch_interrupt_allowed(vcpu) &&
(kvm_cpu_has_interrupt(vcpu) ||
@@ -12938,7 +12661,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 829d3134c1eb..9de72586f406 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -27,6 +27,7 @@ struct kvm_caps {
u64 supported_mce_cap;
u64 supported_xcr0;
u64 supported_xss;
+ u64 supported_perf_cap;
};
void kvm_spurious_fault(void);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2dae413bd62a..4b8e9628fbf5 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -954,6 +954,14 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
evtchn_port_t *ports)
{
@@ -1042,6 +1050,10 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
*r = -EFAULT;
goto out;
}
+ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
+ *r = -EINVAL;
+ goto out;
+ }
}
if (sched_poll.nr_ports == 1)
@@ -1215,6 +1227,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
bool longmode;
u64 input, params[6], r = -ENOSYS;
bool handled = false;
+ u8 cpl;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -1242,9 +1255,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ /*
+ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
+ * are permitted in guest userspace can be handled by the VMM.
+ */
+ if (unlikely(cpl > 0))
+ goto handle_in_userspace;
+
switch (input) {
case __HYPERVISOR_xen_version:
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
@@ -1279,10 +1300,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
if (handled)
return kvm_xen_hypercall_set_result(vcpu, r);
+handle_in_userspace:
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->run->xen.u.hcall.cpl = cpl;
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
@@ -1297,14 +1319,6 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
-static inline int max_evtchn_port(struct kvm *kvm)
-{
- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
- return EVTCHN_2L_NR_CHANNELS;
- else
- return COMPAT_EVTCHN_2L_NR_CHANNELS;
-}
-
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 0b4cc8c597ae..205a00105858 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -429,7 +429,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
aqic_gisa.isc = nisc;
aqic_gisa.ir = 1;
- aqic_gisa.gisa = (uint64_t)gisa >> 4;
+ aqic_gisa.gisa = virt_to_phys(gisa) >> 4;
status = ap_aqic(q->apqn, aqic_gisa, h_nib);
switch (status.response_code) {
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index fdce7a4cfc6f..020ca9bdbb79 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -399,6 +399,11 @@ struct hv_vpset {
u64 bank_contents[];
} __packed;
+/* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */
+#define HV_MAX_SPARSE_VCPU_BANKS (64)
+/* The number of vCPUs in one sparse bank */
+#define HV_VCPUS_PER_SPARSE_BANK (64)
+
/* HvCallSendSyntheticClusterIpi hypercall */
struct hv_send_ipi {
u32 vector;
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index bfb9eb9d7215..d55d2833a37b 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -211,9 +211,10 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
{
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
int this_cpu = smp_processor_id();
+ int max_vcpu_bank = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
- /* valid_bank_mask can represent up to 64 banks */
- if (hv_max_vp_index / 64 >= 64)
+ /* vpset.valid_bank_mask can represent up to HV_MAX_SPARSE_VCPU_BANKS banks */
+ if (max_vcpu_bank >= HV_MAX_SPARSE_VCPU_BANKS)
return 0;
/*
@@ -221,7 +222,7 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
* structs are not cleared between calls, we risk flushing unneeded
* vCPUs otherwise.
*/
- for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+ for (vcpu_bank = 0; vcpu_bank <= max_vcpu_bank; vcpu_bank++)
vpset->bank_contents[vcpu_bank] = 0;
/*
@@ -233,8 +234,8 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
vcpu = hv_cpu_number_to_vp_number(cpu);
if (vcpu == VP_INVAL)
return -1;
- vcpu_bank = vcpu / 64;
- vcpu_offset = vcpu % 64;
+ vcpu_bank = vcpu / HV_VCPUS_PER_SPARSE_BANK;
+ vcpu_offset = vcpu % HV_VCPUS_PER_SPARSE_BANK;
__set_bit(vcpu_offset, (unsigned long *)
&vpset->bank_contents[vcpu_bank]);
if (vcpu_bank >= nr_bank)
diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h
index e3a0be2c90ad..3aa3640f8c18 100644
--- a/include/linux/build_bug.h
+++ b/include/linux/build_bug.h
@@ -77,4 +77,13 @@
#define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
#define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
+
+/*
+ * Compile time check that field has an expected offset
+ */
+#define ASSERT_STRUCT_OFFSET(type, field, expected_offset) \
+ BUILD_BUG_ON_MSG(offsetof(type, field) != (expected_offset), \
+ "Offset of " #field " in " #type " has changed.")
+
+
#endif /* _LINUX_BUILD_BUG_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f51eb9419bfc..1dc501ab874d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -96,6 +96,7 @@
#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
+#define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3)
/*
* error pfns indicate that the gfn is in slot but faild to
@@ -107,6 +108,15 @@ static inline bool is_error_pfn(kvm_pfn_t pfn)
}
/*
+ * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted
+ * by a pending signal. Note, the signal may or may not be fatal.
+ */
+static inline bool is_sigpending_pfn(kvm_pfn_t pfn)
+{
+ return pfn == KVM_PFN_ERR_SIGPENDING;
+}
+
+/*
* error_noslot pfns indicate that the gfn can not be
* translated to pfn - it is not in slot or failed to
* translate it to pfn.
@@ -656,6 +666,8 @@ struct kvm_irq_routing_table {
};
#endif
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);
+
#ifndef KVM_INTERNAL_MEM_SLOTS
#define KVM_INTERNAL_MEM_SLOTS 0
#endif
@@ -711,6 +723,11 @@ struct kvm {
/* The current active memslot set for each address space */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct xarray vcpu_array;
+ /*
+ * Protected by slots_lock, but can be read outside if an
+ * incorrect answer is acceptable.
+ */
+ atomic_t nr_memslots_dirty_logging;
/* Used to wait for completion of MMU notifiers. */
spinlock_t mn_invalidate_lock;
@@ -1142,8 +1159,8 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
- bool atomic, bool *async, bool write_fault,
- bool *writable, hva_t *hva);
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva);
void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8bbcccbc5565..3c84f4e48cd7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2958,6 +2958,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_INTERRUPTIBLE 0x100000 /* allow interrupts from generic signals */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c87b5882d7ae..820efdf9fef8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -485,6 +485,9 @@ struct kvm_run {
#define KVM_MSR_EXIT_REASON_INVAL (1 << 0)
#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1)
#define KVM_MSR_EXIT_REASON_FILTER (1 << 2)
+#define KVM_MSR_EXIT_REASON_VALID_MASK (KVM_MSR_EXIT_REASON_INVAL | \
+ KVM_MSR_EXIT_REASON_UNKNOWN | \
+ KVM_MSR_EXIT_REASON_FILTER)
__u32 reason; /* kernel -> user */
__u32 index; /* kernel -> user */
__u64 data; /* kernel <-> user */
@@ -1178,7 +1181,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_ZPCI_OP 221
#define KVM_CAP_S390_CPU_TOPOLOGY 222
#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
-#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 224
+#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
+#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1741,6 +1745,8 @@ enum pv_cmd_id {
KVM_PV_UNSHARE_ALL,
KVM_PV_INFO,
KVM_PV_DUMP,
+ KVM_PV_ASYNC_CLEANUP_PREPARE,
+ KVM_PV_ASYNC_CLEANUP_PERFORM,
};
struct kvm_pv_cmd {
diff --git a/mm/gup.c b/mm/gup.c
index fe195d47de74..90e372352e82 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -989,8 +989,17 @@ static int faultin_page(struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (locked)
+ if (locked) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ /*
+ * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
+ * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
+ * That's because some callers may not be prepared to
+ * handle early exits caused by non-fatal signals.
+ */
+ if (*flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
if (*flags & FOLL_TRIED) {
@@ -1392,6 +1401,22 @@ retry:
EXPORT_SYMBOL_GPL(fixup_user_fault);
/*
+ * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
+ * specified, it'll also respond to generic signals. The caller of GUP
+ * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
+ */
+static bool gup_signal_pending(unsigned int flags)
+{
+ if (fatal_signal_pending(current))
+ return true;
+
+ if (!(flags & FOLL_INTERRUPTIBLE))
+ return false;
+
+ return signal_pending(current);
+}
+
+/*
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
@@ -1472,11 +1497,11 @@ retry:
* Repeat on the address that fired VM_FAULT_RETRY
* with both FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED. Note that GUP can be interrupted
- * by fatal signals, so we need to check it before we
+ * by fatal signals of even common signals, depending on
+ * the caller's request. So we need to check it before we
* start trying again otherwise it can loop forever.
*/
-
- if (fatal_signal_pending(current)) {
+ if (gup_signal_pending(flags)) {
if (!pages_done)
pages_done = -EINTR;
break;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..b5ed54f760bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6285,9 +6285,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
fault_flags |= FAULT_FLAG_UNSHARE;
- if (locked)
+ if (locked) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
+ if (flags & FOLL_INTERRUPTIBLE)
+ fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
+ }
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h
index 1f5e26aae9fc..01cc27ec4520 100644
--- a/tools/arch/x86/include/asm/atomic.h
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -8,6 +8,7 @@
#define LOCK_PREFIX "\n\tlock; "
+#include <asm/asm.h>
#include <asm/cmpxchg.h>
/*
@@ -70,4 +71,10 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
return cmpxchg(&v->counter, old, new);
}
+static inline int atomic_test_and_set_bit(long nr, unsigned long *addr)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, "Ir", nr, "%0", "c");
+
+}
+
#endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h
index 4c1966f7c77a..6daa68bf5b9e 100644
--- a/tools/include/asm-generic/atomic-gcc.h
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -4,6 +4,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
+#include <linux/bitops.h>
/*
* Atomic operations that C can't guarantee us. Useful for
@@ -69,4 +70,15 @@ static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval)
return cmpxchg(&(v)->counter, oldval, newval);
}
+static inline int atomic_test_and_set_bit(long nr, unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ long old;
+
+ addr += BIT_WORD(nr);
+
+ old = __sync_fetch_and_or(addr, mask);
+ return !!(old & mask);
+}
+
#endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 9c366b3a676d..6f28180ffeea 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -41,11 +41,14 @@ VMX_EXIT_REASONS = {
'EXCEPTION_NMI': 0,
'EXTERNAL_INTERRUPT': 1,
'TRIPLE_FAULT': 2,
- 'PENDING_INTERRUPT': 7,
+ 'INIT_SIGNAL': 3,
+ 'SIPI_SIGNAL': 4,
+ 'INTERRUPT_WINDOW': 7,
'NMI_WINDOW': 8,
'TASK_SWITCH': 9,
'CPUID': 10,
'HLT': 12,
+ 'INVD': 13,
'INVLPG': 14,
'RDPMC': 15,
'RDTSC': 16,
@@ -65,26 +68,48 @@ VMX_EXIT_REASONS = {
'MSR_READ': 31,
'MSR_WRITE': 32,
'INVALID_STATE': 33,
+ 'MSR_LOAD_FAIL': 34,
'MWAIT_INSTRUCTION': 36,
+ 'MONITOR_TRAP_FLAG': 37,
'MONITOR_INSTRUCTION': 39,
'PAUSE_INSTRUCTION': 40,
'MCE_DURING_VMENTRY': 41,
'TPR_BELOW_THRESHOLD': 43,
'APIC_ACCESS': 44,
+ 'EOI_INDUCED': 45,
+ 'GDTR_IDTR': 46,
+ 'LDTR_TR': 47,
'EPT_VIOLATION': 48,
'EPT_MISCONFIG': 49,
+ 'INVEPT': 50,
+ 'RDTSCP': 51,
+ 'PREEMPTION_TIMER': 52,
+ 'INVVPID': 53,
'WBINVD': 54,
'XSETBV': 55,
'APIC_WRITE': 56,
+ 'RDRAND': 57,
'INVPCID': 58,
+ 'VMFUNC': 59,
+ 'ENCLS': 60,
+ 'RDSEED': 61,
+ 'PML_FULL': 62,
+ 'XSAVES': 63,
+ 'XRSTORS': 64,
+ 'UMWAIT': 67,
+ 'TPAUSE': 68,
+ 'BUS_LOCK': 74,
+ 'NOTIFY': 75,
}
SVM_EXIT_REASONS = {
'READ_CR0': 0x000,
+ 'READ_CR2': 0x002,
'READ_CR3': 0x003,
'READ_CR4': 0x004,
'READ_CR8': 0x008,
'WRITE_CR0': 0x010,
+ 'WRITE_CR2': 0x012,
'WRITE_CR3': 0x013,
'WRITE_CR4': 0x014,
'WRITE_CR8': 0x018,
@@ -105,6 +130,7 @@ SVM_EXIT_REASONS = {
'WRITE_DR6': 0x036,
'WRITE_DR7': 0x037,
'EXCP_BASE': 0x040,
+ 'LAST_EXCP': 0x05f,
'INTR': 0x060,
'NMI': 0x061,
'SMI': 0x062,
@@ -151,21 +177,45 @@ SVM_EXIT_REASONS = {
'MWAIT': 0x08b,
'MWAIT_COND': 0x08c,
'XSETBV': 0x08d,
+ 'RDPRU': 0x08e,
+ 'EFER_WRITE_TRAP': 0x08f,
+ 'CR0_WRITE_TRAP': 0x090,
+ 'CR1_WRITE_TRAP': 0x091,
+ 'CR2_WRITE_TRAP': 0x092,
+ 'CR3_WRITE_TRAP': 0x093,
+ 'CR4_WRITE_TRAP': 0x094,
+ 'CR5_WRITE_TRAP': 0x095,
+ 'CR6_WRITE_TRAP': 0x096,
+ 'CR7_WRITE_TRAP': 0x097,
+ 'CR8_WRITE_TRAP': 0x098,
+ 'CR9_WRITE_TRAP': 0x099,
+ 'CR10_WRITE_TRAP': 0x09a,
+ 'CR11_WRITE_TRAP': 0x09b,
+ 'CR12_WRITE_TRAP': 0x09c,
+ 'CR13_WRITE_TRAP': 0x09d,
+ 'CR14_WRITE_TRAP': 0x09e,
+ 'CR15_WRITE_TRAP': 0x09f,
+ 'INVPCID': 0x0a2,
'NPF': 0x400,
+ 'AVIC_INCOMPLETE_IPI': 0x401,
+ 'AVIC_UNACCELERATED_ACCESS': 0x402,
+ 'VMGEXIT': 0x403,
}
-# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h)
+# EC definition of HSR (from arch/arm64/include/asm/esr.h)
AARCH64_EXIT_REASONS = {
'UNKNOWN': 0x00,
- 'WFI': 0x01,
+ 'WFx': 0x01,
'CP15_32': 0x03,
'CP15_64': 0x04,
'CP14_MR': 0x05,
'CP14_LS': 0x06,
'FP_ASIMD': 0x07,
'CP10_ID': 0x08,
+ 'PAC': 0x09,
'CP14_64': 0x0C,
- 'ILL_ISS': 0x0E,
+ 'BTI': 0x0D,
+ 'ILL': 0x0E,
'SVC32': 0x11,
'HVC32': 0x12,
'SMC32': 0x13,
@@ -173,21 +223,26 @@ AARCH64_EXIT_REASONS = {
'HVC64': 0x16,
'SMC64': 0x17,
'SYS64': 0x18,
- 'IABT': 0x20,
- 'IABT_HYP': 0x21,
+ 'SVE': 0x19,
+ 'ERET': 0x1A,
+ 'FPAC': 0x1C,
+ 'SME': 0x1D,
+ 'IMP_DEF': 0x1F,
+ 'IABT_LOW': 0x20,
+ 'IABT_CUR': 0x21,
'PC_ALIGN': 0x22,
- 'DABT': 0x24,
- 'DABT_HYP': 0x25,
+ 'DABT_LOW': 0x24,
+ 'DABT_CUR': 0x25,
'SP_ALIGN': 0x26,
'FP_EXC32': 0x28,
'FP_EXC64': 0x2C,
'SERROR': 0x2F,
- 'BREAKPT': 0x30,
- 'BREAKPT_HYP': 0x31,
- 'SOFTSTP': 0x32,
- 'SOFTSTP_HYP': 0x33,
- 'WATCHPT': 0x34,
- 'WATCHPT_HYP': 0x35,
+ 'BREAKPT_LOW': 0x30,
+ 'BREAKPT_CUR': 0x31,
+ 'SOFTSTP_LOW': 0x32,
+ 'SOFTSTP_CUR': 0x33,
+ 'WATCHPT_LOW': 0x34,
+ 'WATCHPT_CUR': 0x35,
'BKPT32': 0x38,
'VECTOR32': 0x3A,
'BRK64': 0x3C,
@@ -220,6 +275,19 @@ USERSPACE_EXIT_REASONS = {
'S390_TSCH': 22,
'EPR': 23,
'SYSTEM_EVENT': 24,
+ 'S390_STSI': 25,
+ 'IOAPIC_EOI': 26,
+ 'HYPERV': 27,
+ 'ARM_NISV': 28,
+ 'X86_RDMSR': 29,
+ 'X86_WRMSR': 30,
+ 'DIRTY_RING_FULL': 31,
+ 'AP_RESET_HOLD': 32,
+ 'X86_BUS_LOCK': 33,
+ 'XEN': 34,
+ 'RISCV_SBI': 35,
+ 'RISCV_CSR': 36,
+ 'NOTIFY': 37,
}
IOCTL_NUMBERS = {
@@ -1756,7 +1824,7 @@ def assign_globals():
debugfs = ''
for line in open('/proc/mounts'):
- if line.split(' ')[0] == 'debugfs':
+ if line.split(' ')[2] == 'debugfs':
debugfs = line.split(' ')[1]
break
if debugfs == '':
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 4a30d684e208..6ce8c488d62e 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -17,16 +17,18 @@
/x86_64/cpuid_test
/x86_64/cr4_cpuid_sync_test
/x86_64/debug_regs
-/x86_64/evmcs_test
-/x86_64/emulator_error_test
+/x86_64/exit_on_emulation_failure_test
/x86_64/fix_hypercall_test
/x86_64/get_msr_index_features
/x86_64/kvm_clock_test
/x86_64/kvm_pv_test
/x86_64/hyperv_clock
/x86_64/hyperv_cpuid
+/x86_64/hyperv_evmcs
/x86_64/hyperv_features
+/x86_64/hyperv_ipi
/x86_64/hyperv_svm_test
+/x86_64/hyperv_tlb_flush
/x86_64/max_vcpuid_cap_test
/x86_64/mmio_warning_test
/x86_64/monitor_mwait_test
@@ -37,11 +39,13 @@
/x86_64/set_boot_cpu_id
/x86_64/set_sregs_test
/x86_64/sev_migrate_tests
+/x86_64/smaller_maxphyaddr_emulation_test
/x86_64/smm_test
/x86_64/state_test
/x86_64/svm_vmcall_test
/x86_64/svm_int_ctl_test
/x86_64/svm_nested_soft_inject_test
+/x86_64/svm_nested_shutdown_test
/x86_64/sync_regs_test
/x86_64/tsc_msrs_test
/x86_64/tsc_scaling_sync
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1d85b8e218a0..947676983da1 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -43,17 +43,19 @@ LIBKVM += lib/elf.c
LIBKVM += lib/guest_modes.c
LIBKVM += lib/io.c
LIBKVM += lib/kvm_util.c
-LIBKVM += lib/perf_test_util.c
+LIBKVM += lib/memstress.c
LIBKVM += lib/rbtree.c
LIBKVM += lib/sparsebit.c
LIBKVM += lib/test_util.c
+LIBKVM += lib/ucall_common.c
LIBKVM += lib/userfaultfd_util.c
LIBKVM_STRING += lib/string_override.c
LIBKVM_x86_64 += lib/x86_64/apic.c
LIBKVM_x86_64 += lib/x86_64/handlers.S
-LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
+LIBKVM_x86_64 += lib/x86_64/hyperv.c
+LIBKVM_x86_64 += lib/x86_64/memstress.c
LIBKVM_x86_64 += lib/x86_64/processor.c
LIBKVM_x86_64 += lib/x86_64/svm.c
LIBKVM_x86_64 += lib/x86_64/ucall.c
@@ -81,13 +83,15 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
-TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
-TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
+TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush
TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
@@ -97,11 +101,13 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
+TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test
TEST_GEN_PROGS_x86_64 += x86_64/state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
+TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_shutdown_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test
TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
index b6a5e8861b35..4951ac53d1f8 100644
--- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c
@@ -159,12 +159,9 @@ int main(void)
TEST_REQUIRE(vcpu_aarch64_only(vcpu));
- ucall_init(vm, NULL);
-
test_user_raz_wi(vcpu);
test_user_raz_invariant(vcpu);
test_guest_raz(vcpu);
- ucall_uninit(vm);
kvm_vm_free(vm);
}
diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c
index 574eb73f0e90..f2a96779716a 100644
--- a/tools/testing/selftests/kvm/aarch64/arch_timer.c
+++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c
@@ -375,7 +375,6 @@ static struct kvm_vm *test_vm_create(void)
for (i = 0; i < nr_vcpus; i++)
vcpu_init_descriptor_tables(vcpus[i]);
- ucall_init(vm, NULL);
test_init_timer_irq(vm);
gic_fd = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA);
__TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3");
@@ -414,36 +413,21 @@ static bool parse_args(int argc, char *argv[])
while ((opt = getopt(argc, argv, "hn:i:p:m:")) != -1) {
switch (opt) {
case 'n':
- test_args.nr_vcpus = atoi(optarg);
- if (test_args.nr_vcpus <= 0) {
- pr_info("Positive value needed for -n\n");
- goto err;
- } else if (test_args.nr_vcpus > KVM_MAX_VCPUS) {
+ test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+ if (test_args.nr_vcpus > KVM_MAX_VCPUS) {
pr_info("Max allowed vCPUs: %u\n",
KVM_MAX_VCPUS);
goto err;
}
break;
case 'i':
- test_args.nr_iter = atoi(optarg);
- if (test_args.nr_iter <= 0) {
- pr_info("Positive value needed for -i\n");
- goto err;
- }
+ test_args.nr_iter = atoi_positive("Number of iterations", optarg);
break;
case 'p':
- test_args.timer_period_ms = atoi(optarg);
- if (test_args.timer_period_ms <= 0) {
- pr_info("Positive value needed for -p\n");
- goto err;
- }
+ test_args.timer_period_ms = atoi_positive("Periodicity", optarg);
break;
case 'm':
- test_args.migration_freq_ms = atoi(optarg);
- if (test_args.migration_freq_ms < 0) {
- pr_info("0 or positive value needed for -m\n");
- goto err;
- }
+ test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg);
break;
case 'h':
default:
@@ -462,9 +446,6 @@ int main(int argc, char *argv[])
{
struct kvm_vm *vm;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
if (!parse_args(argc, argv))
exit(KSFT_SKIP);
diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
index b30add3e7726..8a3fb212084a 100644
--- a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
+++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c
@@ -377,7 +377,6 @@ static void guest_svc_handler(struct ex_regs *regs)
enum single_step_op {
SINGLE_STEP_ENABLE = 0,
- SINGLE_STEP_DISABLE = 1,
};
static void guest_code_ss(int test_cnt)
@@ -394,7 +393,7 @@ static void guest_code_ss(int test_cnt)
GUEST_SYNC(SINGLE_STEP_ENABLE);
/*
- * The userspace will veriry that the pc is as expected during
+ * The userspace will verify that the pc is as expected during
* single step execution between iter_ss_begin and iter_ss_end.
*/
asm volatile("iter_ss_begin:nop\n");
@@ -404,11 +403,9 @@ static void guest_code_ss(int test_cnt)
bvr = read_sysreg(dbgbvr0_el1);
wvr = read_sysreg(dbgwvr0_el1);
+ /* Userspace disables Single Step when the end is nigh. */
asm volatile("iter_ss_end:\n");
- /* Disable Single Step execution */
- GUEST_SYNC(SINGLE_STEP_DISABLE);
-
GUEST_ASSERT(bvr == w_bvr);
GUEST_ASSERT(wvr == w_wvr);
}
@@ -427,7 +424,6 @@ static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bp
struct ucall uc;
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- ucall_init(vm, NULL);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vcpu);
@@ -474,7 +470,6 @@ void test_single_step_from_userspace(int test_cnt)
struct kvm_guest_debug debug = {};
vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss);
- ucall_init(vm, NULL);
run = vcpu->run;
vcpu_args_set(vcpu, 1, test_cnt);
@@ -492,15 +487,12 @@ void test_single_step_from_userspace(int test_cnt)
TEST_ASSERT(cmd == UCALL_SYNC,
"Unexpected ucall cmd 0x%lx", cmd);
- if (uc.args[1] == SINGLE_STEP_ENABLE) {
- debug.control = KVM_GUESTDBG_ENABLE |
- KVM_GUESTDBG_SINGLESTEP;
- ss_enable = true;
- } else {
- debug.control = SINGLE_STEP_DISABLE;
- ss_enable = false;
- }
+ TEST_ASSERT(uc.args[1] == SINGLE_STEP_ENABLE,
+ "Unexpected ucall action 0x%lx", uc.args[1]);
+ debug.control = KVM_GUESTDBG_ENABLE |
+ KVM_GUESTDBG_SINGLESTEP;
+ ss_enable = true;
vcpu_guest_debug_set(vcpu, &debug);
continue;
}
@@ -513,6 +505,14 @@ void test_single_step_from_userspace(int test_cnt)
"Unexpected pc 0x%lx (expected 0x%lx)",
pc, test_pc);
+ if ((pc + 4) == (uint64_t)&iter_ss_end) {
+ test_pc = 0;
+ debug.control = KVM_GUESTDBG_ENABLE;
+ ss_enable = false;
+ vcpu_guest_debug_set(vcpu, &debug);
+ continue;
+ }
+
/*
* If the current pc is between iter_ss_bgin and
* iter_ss_end, the pc for the next KVM_EXIT_DEBUG should
@@ -590,7 +590,7 @@ int main(int argc, char *argv[])
while ((opt = getopt(argc, argv, "i:")) != -1) {
switch (opt) {
case 'i':
- ss_iteration = atoi(optarg);
+ ss_iteration = atoi_positive("Number of iterations", optarg);
break;
case 'h':
default:
diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c
index a39da3fe4952..bef1499fb465 100644
--- a/tools/testing/selftests/kvm/aarch64/hypercalls.c
+++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c
@@ -236,7 +236,6 @@ static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu)
vm = vm_create_with_one_vcpu(vcpu, guest_code);
- ucall_init(vm, NULL);
steal_time_init(*vcpu);
return vm;
@@ -306,8 +305,6 @@ static void test_run(void)
int main(void)
{
- setbuf(stdout, NULL);
-
test_run();
return 0;
}
diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
index 05bb6a6369c2..95d22cfb7b41 100644
--- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c
+++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c
@@ -611,6 +611,13 @@ static void setup_memslots(struct kvm_vm *vm, struct test_params *p)
vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
}
+static void setup_ucall(struct kvm_vm *vm)
+{
+ struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA);
+
+ ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size);
+}
+
static void setup_default_handlers(struct test_desc *test)
{
if (!test->mmio_handler)
@@ -700,12 +707,11 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vm = ____vm_create(mode);
setup_memslots(vm, p);
kvm_vm_elf_load(vm, program_invocation_name);
+ setup_ucall(vm);
vcpu = vm_vcpu_add(vm, 0, guest_code);
setup_gva_maps(vm);
- ucall_init(vm, NULL);
-
reset_event_counts();
/*
@@ -722,7 +728,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_run_loop(vm, vcpu, test);
- ucall_uninit(vm);
kvm_vm_free(vm);
free_uffd(test, pt_uffd, data_uffd);
diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c
index e0b9e81a3e09..cfa36f387948 100644
--- a/tools/testing/selftests/kvm/aarch64/psci_test.c
+++ b/tools/testing/selftests/kvm/aarch64/psci_test.c
@@ -79,7 +79,6 @@ static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source,
struct kvm_vm *vm;
vm = vm_create(2);
- ucall_init(vm, NULL);
vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init);
init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2);
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c
index 9c131d977a1b..eef816b80993 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_init.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c
@@ -68,8 +68,6 @@ static void guest_code(void)
/* we don't want to assert on run execution, hence that helper */
static int run_vcpu(struct kvm_vcpu *vcpu)
{
- ucall_init(vcpu->vm, NULL);
-
return __vcpu_run(vcpu) ? -errno : 0;
}
diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
index 17417220a083..90d854e0fcff 100644
--- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c
@@ -756,7 +756,6 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split)
print_args(&args);
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- ucall_init(vm, NULL);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vcpu);
@@ -818,22 +817,19 @@ int main(int argc, char **argv)
int opt;
bool eoi_split = false;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) {
switch (opt) {
case 'n':
- nr_irqs = atoi(optarg);
+ nr_irqs = atoi_non_negative("Number of IRQs", optarg);
if (nr_irqs > 1024 || nr_irqs % 32)
help(argv[0]);
break;
case 'e':
- eoi_split = (bool)atoi(optarg);
+ eoi_split = (bool)atoi_paranoid(optarg);
default_args = false;
break;
case 'l':
- level_sensitive = (bool)atoi(optarg);
+ level_sensitive = (bool)atoi_paranoid(optarg);
default_args = false;
break;
case 'h':
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 942370d57392..57a16371e9c2 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -44,7 +44,7 @@
#include "kvm_util.h"
#include "test_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "guest_modes.h"
/* Global variable used to synchronize all of the vCPU threads. */
@@ -123,7 +123,7 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn)
}
static void mark_vcpu_memory_idle(struct kvm_vm *vm,
- struct perf_test_vcpu_args *vcpu_args)
+ struct memstress_vcpu_args *vcpu_args)
{
int vcpu_idx = vcpu_args->vcpu_idx;
uint64_t base_gva = vcpu_args->gva;
@@ -145,7 +145,7 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap.");
for (page = 0; page < pages; page++) {
- uint64_t gva = base_gva + page * perf_test_args.guest_page_size;
+ uint64_t gva = base_gva + page * memstress_args.guest_page_size;
uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva);
if (!pfn) {
@@ -208,7 +208,7 @@ static bool spin_wait_for_next_iteration(int *current_iteration)
int last_iteration = *current_iteration;
do {
- if (READ_ONCE(perf_test_args.stop_vcpus))
+ if (READ_ONCE(memstress_args.stop_vcpus))
return false;
*current_iteration = READ_ONCE(iteration);
@@ -217,10 +217,10 @@ static bool spin_wait_for_next_iteration(int *current_iteration)
return true;
}
-static void vcpu_thread_main(struct perf_test_vcpu_args *vcpu_args)
+static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
- struct kvm_vm *vm = perf_test_args.vm;
+ struct kvm_vm *vm = memstress_args.vm;
int vcpu_idx = vcpu_args->vcpu_idx;
int current_iteration = 0;
@@ -276,7 +276,7 @@ static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *descripti
static void access_memory(struct kvm_vm *vm, int nr_vcpus,
enum access_type access, const char *description)
{
- perf_test_set_wr_fract(vm, (access == ACCESS_READ) ? INT_MAX : 1);
+ memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
iteration_work = ITERATION_ACCESS_MEMORY;
run_iteration(vm, nr_vcpus, description);
}
@@ -300,10 +300,10 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct kvm_vm *vm;
int nr_vcpus = params->nr_vcpus;
- vm = perf_test_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
+ vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
params->backing_src, !overlap_memory_access);
- perf_test_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
+ memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
pr_info("\n");
access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");
@@ -318,8 +318,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
mark_memory_idle(vm, nr_vcpus);
access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");
- perf_test_join_vcpu_threads(nr_vcpus);
- perf_test_destroy_vm(vm);
+ memstress_join_vcpu_threads(nr_vcpus);
+ memstress_destroy_vm(vm);
}
static void help(char *name)
@@ -362,7 +362,7 @@ int main(int argc, char *argv[])
params.vcpu_memory_bytes = parse_size(optarg);
break;
case 'v':
- params.nr_vcpus = atoi(optarg);
+ params.nr_vcpus = atoi_positive("Number of vCPUs", optarg);
break;
case 'o':
overlap_memory_access = true;
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 8e1fe4ffcccd..b0e1fc4de9e2 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -20,7 +20,7 @@
#include "kvm_util.h"
#include "test_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "guest_modes.h"
#include "userfaultfd_util.h"
@@ -32,7 +32,7 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
static size_t demand_paging_size;
static char *guest_data_prototype;
-static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
int vcpu_idx = vcpu_args->vcpu_idx;
@@ -135,7 +135,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct kvm_vm *vm;
int i;
- vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
+ vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
p->src_type, p->partition_vcpu_memory_access);
demand_paging_size = get_backing_src_pagesz(p->src_type);
@@ -150,18 +150,18 @@ static void run_test(enum vm_guest_mode mode, void *arg)
TEST_ASSERT(uffd_descs, "Memory allocation failed");
for (i = 0; i < nr_vcpus; i++) {
- struct perf_test_vcpu_args *vcpu_args;
+ struct memstress_vcpu_args *vcpu_args;
void *vcpu_hva;
void *vcpu_alias;
- vcpu_args = &perf_test_args.vcpu_args[i];
+ vcpu_args = &memstress_args.vcpu_args[i];
/* Cache the host addresses of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
prefault_mem(vcpu_alias,
- vcpu_args->pages * perf_test_args.guest_page_size);
+ vcpu_args->pages * memstress_args.guest_page_size);
/*
* Set up user fault fd to handle demand paging
@@ -169,7 +169,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
*/
uffd_descs[i] = uffd_setup_demand_paging(
p->uffd_mode, p->uffd_delay, vcpu_hva,
- vcpu_args->pages * perf_test_args.guest_page_size,
+ vcpu_args->pages * memstress_args.guest_page_size,
&handle_uffd_page_request);
}
}
@@ -177,10 +177,10 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Finished creating vCPUs and starting uffd threads\n");
clock_gettime(CLOCK_MONOTONIC, &start);
- perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+ memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);
pr_info("Started all vCPUs\n");
- perf_test_join_vcpu_threads(nr_vcpus);
+ memstress_join_vcpu_threads(nr_vcpus);
ts_diff = timespec_elapsed(start);
pr_info("All vCPU threads joined\n");
@@ -193,10 +193,10 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Total guest execution time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
pr_info("Overall demand paging rate: %f pgs/sec\n",
- perf_test_args.vcpu_args[0].pages * nr_vcpus /
+ memstress_args.vcpu_args[0].pages * nr_vcpus /
((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
- perf_test_destroy_vm(vm);
+ memstress_destroy_vm(vm);
free(guest_data_prototype);
if (p->uffd_mode)
@@ -259,8 +259,8 @@ int main(int argc, char *argv[])
p.src_type = parse_backing_src_type(optarg);
break;
case 'v':
- nr_vcpus = atoi(optarg);
- TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+ TEST_ASSERT(nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
case 'o':
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index f99e39a672d3..c33e89012ae6 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -16,7 +16,7 @@
#include "kvm_util.h"
#include "test_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "guest_modes.h"
#ifdef __aarch64__
@@ -67,7 +67,7 @@ static bool host_quit;
static int iteration;
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
-static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
int vcpu_idx = vcpu_args->vcpu_idx;
@@ -128,10 +128,12 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
struct test_params {
unsigned long iterations;
uint64_t phys_offset;
- int wr_fract;
bool partition_vcpu_memory_access;
enum vm_mem_backing_src_type backing_src;
int slots;
+ uint32_t write_percent;
+ uint32_t random_seed;
+ bool random_access;
};
static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
@@ -139,7 +141,7 @@ static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
int i;
for (i = 0; i < slots; i++) {
- int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
vm_mem_region_set_flags(vm, slot, flags);
@@ -161,7 +163,7 @@ static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots
int i;
for (i = 0; i < slots; i++) {
- int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
kvm_vm_get_dirty_log(vm, slot, bitmaps[i]);
}
@@ -173,7 +175,7 @@ static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
int i;
for (i = 0; i < slots; i++) {
- int slot = PERF_TEST_MEM_SLOT_INDEX + i;
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot);
}
@@ -221,11 +223,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct timespec clear_dirty_log_total = (struct timespec){0};
int i;
- vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
+ vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
p->slots, p->backing_src,
p->partition_vcpu_memory_access);
- perf_test_set_wr_fract(vm, p->wr_fract);
+ pr_info("Random seed: %u\n", p->random_seed);
+ memstress_set_random_seed(vm, p->random_seed);
+ memstress_set_write_percent(vm, p->write_percent);
guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift;
guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
@@ -248,7 +252,16 @@ static void run_test(enum vm_guest_mode mode, void *arg)
for (i = 0; i < nr_vcpus; i++)
vcpu_last_completed_iteration[i] = -1;
- perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+ /*
+ * Use 100% writes during the population phase to ensure all
+ * memory is actually populated and not just mapped to the zero
+ * page. The prevents expensive copy-on-write faults from
+ * occurring during the dirty memory iterations below, which
+ * would pollute the performance results.
+ */
+ memstress_set_write_percent(vm, 100);
+ memstress_set_random_access(vm, false);
+ memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);
/* Allow the vCPUs to populate memory */
pr_debug("Starting iteration %d - Populating\n", iteration);
@@ -269,6 +282,9 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
+ memstress_set_write_percent(vm, p->write_percent);
+ memstress_set_random_access(vm, p->random_access);
+
while (iteration < p->iterations) {
/*
* Incrementing the iteration number will start the vCPUs
@@ -329,7 +345,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
* wait for them to exit.
*/
host_quit = true;
- perf_test_join_vcpu_threads(nr_vcpus);
+ memstress_join_vcpu_threads(nr_vcpus);
avg = timespec_div(get_dirty_log_total, p->iterations);
pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
@@ -345,16 +361,17 @@ static void run_test(enum vm_guest_mode mode, void *arg)
free_bitmaps(bitmaps, p->slots);
arch_cleanup_vm(vm);
- perf_test_destroy_vm(vm);
+ memstress_destroy_vm(vm);
}
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-i iterations] [-p offset] [-g] "
- "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]"
- "[-x memslots]\n", name);
+ printf("usage: %s [-h] [-a] [-i iterations] [-p offset] [-g] "
+ "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed ] [-s mem type]"
+ "[-x memslots] [-w percentage] [-c physical cpus to run test on]\n", name);
puts("");
+ printf(" -a: access memory randomly rather than in order.\n");
printf(" -i: specify iteration counts (default: %"PRIu64")\n",
TEST_HOST_LOOP_N);
printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n"
@@ -373,16 +390,29 @@ static void help(char *name)
printf(" -b: specify the size of the memory region which should be\n"
" dirtied by each vCPU. e.g. 10M or 3G.\n"
" (default: 1G)\n");
- printf(" -f: specify the fraction of pages which should be written to\n"
- " as opposed to simply read, in the form\n"
- " 1/<fraction of pages to write>.\n"
- " (default: 1 i.e. all pages are written to.)\n");
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
+ printf(" -r: specify the starting random seed.\n");
backing_src_help("-s");
printf(" -x: Split the memory region into this number of memslots.\n"
" (default: 1)\n");
+ printf(" -w: specify the percentage of pages which should be written to\n"
+ " as an integer from 0-100 inclusive. This is probabalistic,\n"
+ " so -w X means each page has an X%% chance of writing\n"
+ " and a (100-X)%% chance of reading.\n"
+ " (default: 100 i.e. all pages are written to.)\n");
+ printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n"
+ " values (target pCPU), one for each vCPU, plus an optional\n"
+ " entry for the main application task (specified via entry\n"
+ " <nr_vcpus + 1>). If used, entries must be provided for all\n"
+ " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
+ " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
+ " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
+ " ./dirty_log_perf_test -v 3 -c 22,23,24,50\n\n"
+ " To leave the application task unpinned, drop the final entry:\n\n"
+ " ./dirty_log_perf_test -v 3 -c 22,23,24\n\n"
+ " (default: no pinning)\n");
puts("");
exit(0);
}
@@ -390,12 +420,14 @@ static void help(char *name)
int main(int argc, char *argv[])
{
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ const char *pcpu_list = NULL;
struct test_params p = {
.iterations = TEST_HOST_LOOP_N,
- .wr_fract = 1,
.partition_vcpu_memory_access = true,
.backing_src = DEFAULT_VM_MEM_SRC,
.slots = 1,
+ .random_seed = 1,
+ .write_percent = 100,
};
int opt;
@@ -406,55 +438,73 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "eghi:p:m:nb:f:v:os:x:")) != -1) {
+ while ((opt = getopt(argc, argv, "ab:c:eghi:m:nop:r:s:v:x:w:")) != -1) {
switch (opt) {
+ case 'a':
+ p.random_access = true;
+ break;
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'c':
+ pcpu_list = optarg;
+ break;
case 'e':
/* 'e' is for evil. */
run_vcpus_while_disabling_dirty_logging = true;
+ break;
case 'g':
dirty_log_manual_caps = 0;
break;
- case 'i':
- p.iterations = atoi(optarg);
+ case 'h':
+ help(argv[0]);
break;
- case 'p':
- p.phys_offset = strtoull(optarg, NULL, 0);
+ case 'i':
+ p.iterations = atoi_positive("Number of iterations", optarg);
break;
case 'm':
guest_modes_cmdline(optarg);
break;
case 'n':
- perf_test_args.nested = true;
- break;
- case 'b':
- guest_percpu_mem_size = parse_size(optarg);
- break;
- case 'f':
- p.wr_fract = atoi(optarg);
- TEST_ASSERT(p.wr_fract >= 1,
- "Write fraction cannot be less than one");
- break;
- case 'v':
- nr_vcpus = atoi(optarg);
- TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
- "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+ memstress_args.nested = true;
break;
case 'o':
p.partition_vcpu_memory_access = false;
break;
+ case 'p':
+ p.phys_offset = strtoull(optarg, NULL, 0);
+ break;
+ case 'r':
+ p.random_seed = atoi_positive("Random seed", optarg);
+ break;
case 's':
p.backing_src = parse_backing_src_type(optarg);
break;
+ case 'v':
+ nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+ TEST_ASSERT(nr_vcpus <= max_vcpus,
+ "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+ break;
+ case 'w':
+ p.write_percent = atoi_non_negative("Write percentage", optarg);
+ TEST_ASSERT(p.write_percent <= 100,
+ "Write percentage must be between 0 and 100");
+ break;
case 'x':
- p.slots = atoi(optarg);
+ p.slots = atoi_positive("Number of slots", optarg);
break;
- case 'h':
default:
help(argv[0]);
break;
}
}
+ if (pcpu_list) {
+ kvm_parse_vcpu_pinning(pcpu_list, memstress_args.vcpu_to_pcpu,
+ nr_vcpus);
+ memstress_args.pin_vcpus = true;
+ }
+
TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations");
pr_info("Test iterations: %"PRIu64"\n", p.iterations);
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index a87e5f78ebf1..9d4c50c4e72e 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -779,8 +779,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
/* Cache the HVA pointer of the region */
host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
- ucall_init(vm, NULL);
-
/* Export the shared variables to the guest */
sync_global_to_guest(vm, host_page_size);
sync_global_to_guest(vm, guest_page_size);
@@ -838,7 +836,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
free(bmap);
free(host_bmap_track);
- ucall_uninit(vm);
kvm_vm_free(vm);
}
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index b0da75af1ff3..37500c92dd0a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -16,6 +16,7 @@
#include <linux/kvm.h>
#include "linux/rbtree.h"
+#include <asm/atomic.h>
#include <sys/ioctl.h>
@@ -90,6 +91,7 @@ struct kvm_vm {
struct sparsebit *vpages_mapped;
bool has_irqchip;
bool pgd_created;
+ vm_paddr_t ucall_mmio_addr;
vm_paddr_t pgd;
vm_vaddr_t gdt;
vm_vaddr_t tss;
@@ -406,6 +408,7 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id);
+vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min);
vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
enum kvm_mem_region_type type);
@@ -715,6 +718,10 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm);
+void kvm_pin_this_task_to_pcpu(uint32_t pcpu);
+void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
+ int nr_vcpus);
+
unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
@@ -745,6 +752,19 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
memcpy(&(g), _p, sizeof(g)); \
})
+/*
+ * Write a global value, but only in the VM's (guest's) domain. Primarily used
+ * for "globals" that hold per-VM values (VMs always duplicate code and global
+ * data into their own region of physical memory), but can be used anytime it's
+ * undesirable to change the host's copy of the global.
+ */
+#define write_guest_global(vm, g, val) ({ \
+ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
+ typeof(g) _val = val; \
+ \
+ memcpy(_p, &(_val), sizeof(g)); \
+})
+
void assert_on_unhandled_exception(struct kvm_vcpu *vcpu);
void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu,
@@ -865,4 +885,13 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm)
return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0);
}
+/*
+ * Arch hook that is invoked via a constructor, i.e. before exeucting main(),
+ * to allow for arch-specific setup that is common to all tests, e.g. computing
+ * the default guest "mode".
+ */
+void kvm_selftest_arch_init(void);
+
+void kvm_arch_vm_post_create(struct kvm_vm *vm);
+
#endif /* SELFTEST_KVM_UTIL_BASE_H */
diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h
new file mode 100644
index 000000000000..72e3e358ef7b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/memstress.h
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tools/testing/selftests/kvm/include/memstress.h
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_MEMSTRESS_H
+#define SELFTEST_KVM_MEMSTRESS_H
+
+#include <pthread.h>
+
+#include "kvm_util.h"
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */
+
+#define MEMSTRESS_MEM_SLOT_INDEX 1
+
+struct memstress_vcpu_args {
+ uint64_t gpa;
+ uint64_t gva;
+ uint64_t pages;
+
+ /* Only used by the host userspace part of the vCPU thread */
+ struct kvm_vcpu *vcpu;
+ int vcpu_idx;
+};
+
+struct memstress_args {
+ struct kvm_vm *vm;
+ /* The starting address and size of the guest test region. */
+ uint64_t gpa;
+ uint64_t size;
+ uint64_t guest_page_size;
+ uint32_t random_seed;
+ uint32_t write_percent;
+
+ /* Run vCPUs in L2 instead of L1, if the architecture supports it. */
+ bool nested;
+ /* Randomize which pages are accessed by the guest. */
+ bool random_access;
+ /* True if all vCPUs are pinned to pCPUs */
+ bool pin_vcpus;
+ /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */
+ uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS];
+
+ /* Test is done, stop running vCPUs. */
+ bool stop_vcpus;
+
+ struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS];
+};
+
+extern struct memstress_args memstress_args;
+
+struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
+ uint64_t vcpu_memory_bytes, int slots,
+ enum vm_mem_backing_src_type backing_src,
+ bool partition_vcpu_memory_access);
+void memstress_destroy_vm(struct kvm_vm *vm);
+
+void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent);
+void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed);
+void memstress_set_random_access(struct kvm_vm *vm, bool random_access);
+
+void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *));
+void memstress_join_vcpu_threads(int vcpus);
+void memstress_guest_code(uint32_t vcpu_id);
+
+uint64_t memstress_nested_pages(int nr_vcpus);
+void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
+
+#endif /* SELFTEST_KVM_MEMSTRESS_H */
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
deleted file mode 100644
index 536d7c3c3f14..000000000000
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * tools/testing/selftests/kvm/include/perf_test_util.h
- *
- * Copyright (C) 2020, Google LLC.
- */
-
-#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H
-#define SELFTEST_KVM_PERF_TEST_UTIL_H
-
-#include <pthread.h>
-
-#include "kvm_util.h"
-
-/* Default guest test virtual memory offset */
-#define DEFAULT_GUEST_TEST_MEM 0xc0000000
-
-#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */
-
-#define PERF_TEST_MEM_SLOT_INDEX 1
-
-struct perf_test_vcpu_args {
- uint64_t gpa;
- uint64_t gva;
- uint64_t pages;
-
- /* Only used by the host userspace part of the vCPU thread */
- struct kvm_vcpu *vcpu;
- int vcpu_idx;
-};
-
-struct perf_test_args {
- struct kvm_vm *vm;
- /* The starting address and size of the guest test region. */
- uint64_t gpa;
- uint64_t size;
- uint64_t guest_page_size;
- int wr_fract;
-
- /* Run vCPUs in L2 instead of L1, if the architecture supports it. */
- bool nested;
-
- /* Test is done, stop running vCPUs. */
- bool stop_vcpus;
-
- struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS];
-};
-
-extern struct perf_test_args perf_test_args;
-
-struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
- uint64_t vcpu_memory_bytes, int slots,
- enum vm_mem_backing_src_type backing_src,
- bool partition_vcpu_memory_access);
-void perf_test_destroy_vm(struct kvm_vm *vm);
-
-void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract);
-
-void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *));
-void perf_test_join_vcpu_threads(int vcpus);
-void perf_test_guest_code(uint32_t vcpu_id);
-
-uint64_t perf_test_nested_pages(int nr_vcpus);
-void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
-
-#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index befc754ce9b3..80d6416f3012 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -77,6 +77,13 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
struct timespec timespec_elapsed(struct timespec start);
struct timespec timespec_div(struct timespec ts, int divisor);
+struct guest_random_state {
+ uint32_t seed;
+};
+
+struct guest_random_state new_guest_random_state(uint32_t seed);
+uint32_t guest_random_u32(struct guest_random_state *state);
+
enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
@@ -152,4 +159,22 @@ static inline void *align_ptr_up(void *x, size_t size)
return (void *)align_up((unsigned long)x, size);
}
+int atoi_paranoid(const char *num_str);
+
+static inline uint32_t atoi_positive(const char *name, const char *num_str)
+{
+ int num = atoi_paranoid(num_str);
+
+ TEST_ASSERT(num > 0, "%s must be greater than 0, got '%s'", name, num_str);
+ return num;
+}
+
+static inline uint32_t atoi_non_negative(const char *name, const char *num_str)
+{
+ int num = atoi_paranoid(num_str);
+
+ TEST_ASSERT(num >= 0, "%s must be non-negative, got '%s'", name, num_str);
+ return num;
+}
+
#endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h
index ee79d180e07e..bdd373189a77 100644
--- a/tools/testing/selftests/kvm/include/ucall_common.h
+++ b/tools/testing/selftests/kvm/include/ucall_common.h
@@ -22,12 +22,18 @@ enum {
struct ucall {
uint64_t cmd;
uint64_t args[UCALL_MAX_ARGS];
+
+ /* Host virtual address of this struct. */
+ struct ucall *hva;
};
-void ucall_init(struct kvm_vm *vm, void *arg);
-void ucall_uninit(struct kvm_vm *vm);
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
+void ucall_arch_do_ucall(vm_vaddr_t uc);
+void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu);
+
void ucall(uint64_t cmd, int nargs, ...);
uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
+void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h
index 58db74f68af2..901caf0e0939 100644
--- a/tools/testing/selftests/kvm/include/x86_64/evmcs.h
+++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h
@@ -10,6 +10,7 @@
#define SELFTEST_KVM_EVMCS_H
#include <stdint.h>
+#include "hyperv.h"
#include "vmx.h"
#define u16 uint16_t
@@ -20,15 +21,6 @@
extern bool enable_evmcs;
-struct hv_vp_assist_page {
- __u32 apic_assist;
- __u32 reserved;
- __u64 vtl_control[2];
- __u64 nested_enlightenments_control[2];
- __u32 enlighten_vmentry;
- __u64 current_nested_vmcs;
-};
-
struct hv_enlightened_vmcs {
u32 revision_id;
u32 abort;
@@ -41,6 +33,8 @@ struct hv_enlightened_vmcs {
u16 host_gs_selector;
u16 host_tr_selector;
+ u16 padding16_1;
+
u64 host_ia32_pat;
u64 host_ia32_efer;
@@ -159,7 +153,7 @@ struct hv_enlightened_vmcs {
u64 ept_pointer;
u16 virtual_processor_id;
- u16 padding16[3];
+ u16 padding16_2[3];
u64 padding64_2[5];
u64 guest_physical_address;
@@ -195,13 +189,13 @@ struct hv_enlightened_vmcs {
u64 guest_rip;
u32 hv_clean_fields;
- u32 hv_padding_32;
+ u32 padding32_1;
u32 hv_synthetic_controls;
struct {
u32 nested_flush_hypercall:1;
u32 msr_bitmap:1;
u32 reserved:30;
- } hv_enlightenments_control;
+ } __packed hv_enlightenments_control;
u32 hv_vp_id;
u32 padding32_2;
u64 hv_vm_id;
@@ -222,7 +216,7 @@ struct hv_enlightened_vmcs {
u64 host_ssp;
u64 host_ia32_int_ssp_table_addr;
u64 padding64_6;
-};
+} __packed;
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
@@ -243,29 +237,15 @@ struct hv_enlightened_vmcs {
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
-#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
-#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
-#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
-#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
- (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031
extern struct hv_enlightened_vmcs *current_evmcs;
-extern struct hv_vp_assist_page *current_vp_assist;
int vcpu_enable_evmcs(struct kvm_vcpu *vcpu);
-static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist)
+static inline void evmcs_enable(void)
{
- u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) |
- HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-
- wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val);
-
- current_vp_assist = vp_assist;
-
enable_evmcs = true;
-
- return 0;
}
static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs)
@@ -278,6 +258,16 @@ static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs)
return 0;
}
+static inline bool load_evmcs(struct hyperv_test_pages *hv)
+{
+ if (evmcs_vmptrld(hv->enlightened_vmcs_gpa, hv->enlightened_vmcs))
+ return false;
+
+ current_evmcs->revision_id = EVMCS_VERSION;
+
+ return true;
+}
+
static inline int evmcs_vmptrst(uint64_t *value)
{
*value = current_vp_assist->current_nested_vmcs &
diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
index b66910702c0a..9218bb5f44bf 100644
--- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h
+++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
@@ -9,6 +9,8 @@
#ifndef SELFTEST_KVM_HYPERV_H
#define SELFTEST_KVM_HYPERV_H
+#include "processor.h"
+
#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
#define HYPERV_CPUID_INTERFACE 0x40000001
#define HYPERV_CPUID_VERSION 0x40000002
@@ -184,5 +186,106 @@
/* hypercall options */
#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+
+/*
+ * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status'
+ * is set to the hypercall status (if no exception occurred).
+ */
+static inline uint8_t __hyperv_hypercall(u64 control, vm_vaddr_t input_address,
+ vm_vaddr_t output_address,
+ uint64_t *hv_status)
+{
+ uint64_t error_code;
+ uint8_t vector;
+
+ /* Note both the hypercall and the "asm safe" clobber r9-r11. */
+ asm volatile("mov %[output_address], %%r8\n\t"
+ KVM_ASM_SAFE("vmcall")
+ : "=a" (*hv_status),
+ "+c" (control), "+d" (input_address),
+ KVM_ASM_SAFE_OUTPUTS(vector, error_code)
+ : [output_address] "r"(output_address),
+ "a" (-EFAULT)
+ : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS);
+ return vector;
+}
+
+/* Issue a Hyper-V hypercall and assert that it succeeded. */
+static inline void hyperv_hypercall(u64 control, vm_vaddr_t input_address,
+ vm_vaddr_t output_address)
+{
+ uint64_t hv_status;
+ uint8_t vector;
+
+ vector = __hyperv_hypercall(control, input_address, output_address, &hv_status);
+
+ GUEST_ASSERT(!vector);
+ GUEST_ASSERT((hv_status & 0xffff) == 0);
+}
+
+/* Write 'Fast' hypercall input 'data' to the first 'n_sse_regs' SSE regs */
+static inline void hyperv_write_xmm_input(void *data, int n_sse_regs)
+{
+ int i;
+
+ for (i = 0; i < n_sse_regs; i++)
+ write_sse_reg(i, (sse128_t *)(data + sizeof(sse128_t) * i));
+}
+
+/* Proper HV_X64_MSR_GUEST_OS_ID value */
+#define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48)
+
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+struct hv_nested_enlightenments_control {
+ struct {
+ __u32 directhypercall:1;
+ __u32 reserved:31;
+ } features;
+ struct {
+ __u32 reserved;
+ } hypercallControls;
+} __packed;
+
+/* Define virtual processor assist page structure. */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved1;
+ __u64 vtl_control[3];
+ struct hv_nested_enlightenments_control nested_control;
+ __u8 enlighten_vmentry;
+ __u8 reserved2[7];
+ __u64 current_nested_vmcs;
+} __packed;
+
+extern struct hv_vp_assist_page *current_vp_assist;
+
+int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist);
+
+struct hyperv_test_pages {
+ /* VP assist page */
+ void *vp_assist_hva;
+ uint64_t vp_assist_gpa;
+ void *vp_assist;
+
+ /* Partition assist page */
+ void *partition_assist_hva;
+ uint64_t partition_assist_gpa;
+ void *partition_assist;
+
+ /* Enlightened VMCS */
+ void *enlightened_vmcs_hva;
+ uint64_t enlightened_vmcs_gpa;
+ void *enlightened_vmcs;
+};
+
+struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm,
+ vm_vaddr_t *p_hv_pages_gva);
#endif /* !SELFTEST_KVM_HYPERV_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index e8ca0d8a6a7e..5d310abe6c3f 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -63,16 +63,21 @@ struct kvm_x86_cpu_feature {
u8 reg;
u8 bit;
};
-#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \
-({ \
- struct kvm_x86_cpu_feature feature = { \
- .function = fn, \
- .index = idx, \
- .reg = KVM_CPUID_##gpr, \
- .bit = __bit, \
- }; \
- \
- feature; \
+#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \
+({ \
+ struct kvm_x86_cpu_feature feature = { \
+ .function = fn, \
+ .index = idx, \
+ .reg = KVM_CPUID_##gpr, \
+ .bit = __bit, \
+ }; \
+ \
+ static_assert((fn & 0xc0000000) == 0 || \
+ (fn & 0xc0000000) == 0x40000000 || \
+ (fn & 0xc0000000) == 0x80000000 || \
+ (fn & 0xc0000000) == 0xc0000000); \
+ static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \
+ feature; \
})
/*
@@ -89,6 +94,7 @@ struct kvm_x86_cpu_feature {
#define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26)
#define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27)
#define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30)
+#define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6)
#define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7)
#define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9)
#define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19)
@@ -162,6 +168,102 @@ struct kvm_x86_cpu_feature {
#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16)
#define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17)
+/*
+ * Same idea as X86_FEATURE_XXX, but X86_PROPERTY_XXX retrieves a multi-bit
+ * value/property as opposed to a single-bit feature. Again, pack the info
+ * into a 64-bit value to pass by value with no overhead.
+ */
+struct kvm_x86_cpu_property {
+ u32 function;
+ u8 index;
+ u8 reg;
+ u8 lo_bit;
+ u8 hi_bit;
+};
+#define KVM_X86_CPU_PROPERTY(fn, idx, gpr, low_bit, high_bit) \
+({ \
+ struct kvm_x86_cpu_property property = { \
+ .function = fn, \
+ .index = idx, \
+ .reg = KVM_CPUID_##gpr, \
+ .lo_bit = low_bit, \
+ .hi_bit = high_bit, \
+ }; \
+ \
+ static_assert(low_bit < high_bit); \
+ static_assert((fn & 0xc0000000) == 0 || \
+ (fn & 0xc0000000) == 0x40000000 || \
+ (fn & 0xc0000000) == 0x80000000 || \
+ (fn & 0xc0000000) == 0xc0000000); \
+ static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \
+ property; \
+})
+
+#define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31)
+#define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7)
+#define X86_PROPERTY_PMU_NR_GP_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 8, 15)
+#define X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 24, 31)
+
+#define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31)
+#define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31)
+#define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31)
+#define X86_PROPERTY_XSTATE_TILE_OFFSET KVM_X86_CPU_PROPERTY(0xd, 18, EBX, 0, 31)
+#define X86_PROPERTY_AMX_TOTAL_TILE_BYTES KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 0, 15)
+#define X86_PROPERTY_AMX_BYTES_PER_TILE KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 16, 31)
+#define X86_PROPERTY_AMX_BYTES_PER_ROW KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 0, 15)
+#define X86_PROPERTY_AMX_NR_TILE_REGS KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 16, 31)
+#define X86_PROPERTY_AMX_MAX_ROWS KVM_X86_CPU_PROPERTY(0x1d, 1, ECX, 0, 15)
+
+#define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31)
+
+#define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31)
+#define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7)
+#define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15)
+#define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11)
+
+#define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31)
+
+/*
+ * Intel's architectural PMU events are bizarre. They have a "feature" bit
+ * that indicates the feature is _not_ supported, and a property that states
+ * the length of the bit mask of unsupported features. A feature is supported
+ * if the size of the bit mask is larger than the "unavailable" bit, and said
+ * bit is not set.
+ *
+ * Wrap the "unavailable" feature to simplify checking whether or not a given
+ * architectural event is supported.
+ */
+struct kvm_x86_pmu_feature {
+ struct kvm_x86_cpu_feature anti_feature;
+};
+#define KVM_X86_PMU_FEATURE(name, __bit) \
+({ \
+ struct kvm_x86_pmu_feature feature = { \
+ .anti_feature = KVM_X86_CPU_FEATURE(0xa, 0, EBX, __bit), \
+ }; \
+ \
+ feature; \
+})
+
+#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(BRANCH_INSNS_RETIRED, 5)
+
+static inline unsigned int x86_family(unsigned int eax)
+{
+ unsigned int x86;
+
+ x86 = (eax >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (eax >> 20) & 0xff;
+
+ return x86;
+}
+
+static inline unsigned int x86_model(unsigned int eax)
+{
+ return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
+}
+
/* Page table bitfield declarations */
#define PTE_PRESENT_MASK BIT_ULL(0)
#define PTE_WRITABLE_MASK BIT_ULL(1)
@@ -172,12 +274,18 @@ struct kvm_x86_cpu_feature {
#define PTE_GLOBAL_MASK BIT_ULL(8)
#define PTE_NX_MASK BIT_ULL(63)
+#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12)
+
#define PAGE_SHIFT 12
#define PAGE_SIZE (1ULL << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#define PAGE_MASK (~(PAGE_SIZE-1) & PHYSICAL_PAGE_MASK)
-#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12)
-#define PTE_GET_PFN(pte) (((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define HUGEPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
+#define HUGEPAGE_SIZE(x) (1UL << HUGEPAGE_SHIFT(x))
+#define HUGEPAGE_MASK(x) (~(HUGEPAGE_SIZE(x) - 1) & PHYSICAL_PAGE_MASK)
+
+#define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK)
+#define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT)
/* General Registers in 64-Bit Mode */
struct gpr64_regs {
@@ -425,82 +533,143 @@ static inline void cpuid(uint32_t function,
return __cpuid(function, 0, eax, ebx, ecx, edx);
}
-static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature)
+static inline uint32_t this_cpu_fms(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+static inline uint32_t this_cpu_family(void)
+{
+ return x86_family(this_cpu_fms());
+}
+
+static inline uint32_t this_cpu_model(void)
+{
+ return x86_model(this_cpu_fms());
+}
+
+static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index,
+ uint8_t reg, uint8_t lo, uint8_t hi)
{
uint32_t gprs[4];
- __cpuid(feature.function, feature.index,
+ __cpuid(function, index,
&gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX],
&gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]);
- return gprs[feature.reg] & BIT(feature.bit);
+ return (gprs[reg] & GENMASK(hi, lo)) >> lo;
+}
+
+static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature)
+{
+ return __this_cpu_has(feature.function, feature.index,
+ feature.reg, feature.bit, feature.bit);
}
-#define SET_XMM(__var, __xmm) \
- asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
+static inline uint32_t this_cpu_property(struct kvm_x86_cpu_property property)
+{
+ return __this_cpu_has(property.function, property.index,
+ property.reg, property.lo_bit, property.hi_bit);
+}
-static inline void set_xmm(int n, unsigned long val)
+static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property)
{
- switch (n) {
+ uint32_t max_leaf;
+
+ switch (property.function & 0xc0000000) {
case 0:
- SET_XMM(val, xmm0);
+ max_leaf = this_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF);
+ break;
+ case 0x40000000:
+ max_leaf = this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF);
+ break;
+ case 0x80000000:
+ max_leaf = this_cpu_property(X86_PROPERTY_MAX_EXT_LEAF);
+ break;
+ case 0xc0000000:
+ max_leaf = this_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF);
+ }
+ return max_leaf >= property.function;
+}
+
+static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature)
+{
+ uint32_t nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+
+ return nr_bits > feature.anti_feature.bit &&
+ !this_cpu_has(feature.anti_feature);
+}
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+
+static inline void read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0:
+ asm("movdqa %%xmm0, %0" : "=m"(*data));
break;
case 1:
- SET_XMM(val, xmm1);
+ asm("movdqa %%xmm1, %0" : "=m"(*data));
break;
case 2:
- SET_XMM(val, xmm2);
+ asm("movdqa %%xmm2, %0" : "=m"(*data));
break;
case 3:
- SET_XMM(val, xmm3);
+ asm("movdqa %%xmm3, %0" : "=m"(*data));
break;
case 4:
- SET_XMM(val, xmm4);
+ asm("movdqa %%xmm4, %0" : "=m"(*data));
break;
case 5:
- SET_XMM(val, xmm5);
+ asm("movdqa %%xmm5, %0" : "=m"(*data));
break;
case 6:
- SET_XMM(val, xmm6);
+ asm("movdqa %%xmm6, %0" : "=m"(*data));
break;
case 7:
- SET_XMM(val, xmm7);
+ asm("movdqa %%xmm7, %0" : "=m"(*data));
break;
+ default:
+ BUG();
}
}
-#define GET_XMM(__xmm) \
-({ \
- unsigned long __val; \
- asm volatile("movq %%"#__xmm", %0" : "=r"(__val)); \
- __val; \
-})
-
-static inline unsigned long get_xmm(int n)
+static inline void write_sse_reg(int reg, const sse128_t *data)
{
- assert(n >= 0 && n <= 7);
-
- switch (n) {
+ switch (reg) {
case 0:
- return GET_XMM(xmm0);
+ asm("movdqa %0, %%xmm0" : : "m"(*data));
+ break;
case 1:
- return GET_XMM(xmm1);
+ asm("movdqa %0, %%xmm1" : : "m"(*data));
+ break;
case 2:
- return GET_XMM(xmm2);
+ asm("movdqa %0, %%xmm2" : : "m"(*data));
+ break;
case 3:
- return GET_XMM(xmm3);
+ asm("movdqa %0, %%xmm3" : : "m"(*data));
+ break;
case 4:
- return GET_XMM(xmm4);
+ asm("movdqa %0, %%xmm4" : : "m"(*data));
+ break;
case 5:
- return GET_XMM(xmm5);
+ asm("movdqa %0, %%xmm5" : : "m"(*data));
+ break;
case 6:
- return GET_XMM(xmm6);
+ asm("movdqa %0, %%xmm6" : : "m"(*data));
+ break;
case 7:
- return GET_XMM(xmm7);
+ asm("movdqa %0, %%xmm7" : : "m"(*data));
+ break;
+ default:
+ BUG();
}
-
- /* never reached */
- return 0;
}
static inline void cpu_relax(void)
@@ -508,11 +677,6 @@ static inline void cpu_relax(void)
asm volatile("rep; nop" ::: "memory");
}
-#define vmmcall() \
- __asm__ __volatile__( \
- "vmmcall\n" \
- )
-
#define ud2() \
__asm__ __volatile__( \
"ud2\n" \
@@ -526,23 +690,6 @@ static inline void cpu_relax(void)
bool is_intel_cpu(void);
bool is_amd_cpu(void);
-static inline unsigned int x86_family(unsigned int eax)
-{
- unsigned int x86;
-
- x86 = (eax >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (eax >> 20) & 0xff;
-
- return x86;
-}
-
-static inline unsigned int x86_model(unsigned int eax)
-{
- return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
-}
-
struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu);
void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state);
void kvm_x86_state_cleanup(struct kvm_x86_state *state);
@@ -604,10 +751,27 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs)
vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs);
}
+const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
+ uint32_t function, uint32_t index);
const struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu);
+static inline uint32_t kvm_cpu_fms(void)
+{
+ return get_cpuid_entry(kvm_get_supported_cpuid(), 0x1, 0)->eax;
+}
+
+static inline uint32_t kvm_cpu_family(void)
+{
+ return x86_family(kvm_cpu_fms());
+}
+
+static inline uint32_t kvm_cpu_model(void)
+{
+ return x86_model(kvm_cpu_fms());
+}
+
bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
struct kvm_x86_cpu_feature feature);
@@ -616,6 +780,42 @@ static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature)
return kvm_cpuid_has(kvm_get_supported_cpuid(), feature);
}
+uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
+ struct kvm_x86_cpu_property property);
+
+static inline uint32_t kvm_cpu_property(struct kvm_x86_cpu_property property)
+{
+ return kvm_cpuid_property(kvm_get_supported_cpuid(), property);
+}
+
+static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property)
+{
+ uint32_t max_leaf;
+
+ switch (property.function & 0xc0000000) {
+ case 0:
+ max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF);
+ break;
+ case 0x40000000:
+ max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_KVM_LEAF);
+ break;
+ case 0x80000000:
+ max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_EXT_LEAF);
+ break;
+ case 0xc0000000:
+ max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF);
+ }
+ return max_leaf >= property.function;
+}
+
+static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature)
+{
+ uint32_t nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+
+ return nr_bits > feature.anti_feature.bit &&
+ !kvm_cpu_has(feature.anti_feature);
+}
+
static inline size_t kvm_cpuid2_size(int nr_entries)
{
return sizeof(struct kvm_cpuid2) +
@@ -639,8 +839,6 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries)
return cpuid;
}
-const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
- uint32_t function, uint32_t index);
void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid);
void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu);
@@ -701,17 +899,6 @@ static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu,
vcpu_set_or_clear_cpuid_feature(vcpu, feature, false);
}
-static inline const struct kvm_cpuid_entry2 *__kvm_get_supported_cpuid_entry(uint32_t function,
- uint32_t index)
-{
- return get_cpuid_entry(kvm_get_supported_cpuid(), function, index);
-}
-
-static inline const struct kvm_cpuid_entry2 *kvm_get_supported_cpuid_entry(uint32_t function)
-{
- return __kvm_get_supported_cpuid_entry(function, 0);
-}
-
uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index);
int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value);
@@ -723,15 +910,6 @@ static inline void vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index,
TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r));
}
-static inline uint32_t kvm_get_cpuid_max_basic(void)
-{
- return kvm_get_supported_cpuid_entry(0)->eax;
-}
-
-static inline uint32_t kvm_get_cpuid_max_extended(void)
-{
- return kvm_get_supported_cpuid_entry(0x80000000)->eax;
-}
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
bool vm_is_unrestricted_guest(struct kvm_vm *vm);
@@ -748,6 +926,19 @@ struct ex_regs {
uint64_t rflags;
};
+struct idt_entry {
+ uint16_t offset0;
+ uint16_t selector;
+ uint16_t ist : 3;
+ uint16_t : 5;
+ uint16_t type : 4;
+ uint16_t : 1;
+ uint16_t dpl : 2;
+ uint16_t p : 1;
+ uint16_t offset1;
+ uint32_t offset2; uint32_t reserved;
+};
+
void vm_init_descriptor_tables(struct kvm_vm *vm);
void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu);
void vm_install_exception_handler(struct kvm_vm *vm, int vector,
@@ -764,7 +955,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector,
* for recursive faults when accessing memory in the handler. The downside to
* using registers is that it restricts what registers can be used by the actual
* instruction. But, selftests are 64-bit only, making register* pressure a
- * minor concern. Use r9-r11 as they are volatile, i.e. don't need* to be saved
+ * minor concern. Use r9-r11 as they are volatile, i.e. don't need to be saved
* by the callee, and except for r11 are not implicit parameters to any
* instructions. Ideally, fixup would use r8-r10 and thus avoid implicit
* parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V
@@ -780,39 +971,52 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector,
*
* REGISTER OUTPUTS:
* r9 = exception vector (non-zero)
+ * r10 = error code
*/
#define KVM_ASM_SAFE(insn) \
"mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \
"lea 1f(%%rip), %%r10\n\t" \
"lea 2f(%%rip), %%r11\n\t" \
"1: " insn "\n\t" \
- "movb $0, %[vector]\n\t" \
- "jmp 3f\n\t" \
+ "xor %%r9, %%r9\n\t" \
"2:\n\t" \
"mov %%r9b, %[vector]\n\t" \
- "3:\n\t"
+ "mov %%r10, %[error_code]\n\t"
-#define KVM_ASM_SAFE_OUTPUTS(v) [vector] "=qm"(v)
+#define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec)
#define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11"
-#define kvm_asm_safe(insn, inputs...) \
-({ \
- uint8_t vector; \
- \
- asm volatile(KVM_ASM_SAFE(insn) \
- : KVM_ASM_SAFE_OUTPUTS(vector) \
- : inputs \
- : KVM_ASM_SAFE_CLOBBERS); \
- vector; \
+#define kvm_asm_safe(insn, inputs...) \
+({ \
+ uint64_t ign_error_code; \
+ uint8_t vector; \
+ \
+ asm volatile(KVM_ASM_SAFE(insn) \
+ : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \
+ : inputs \
+ : KVM_ASM_SAFE_CLOBBERS); \
+ vector; \
+})
+
+#define kvm_asm_safe_ec(insn, error_code, inputs...) \
+({ \
+ uint8_t vector; \
+ \
+ asm volatile(KVM_ASM_SAFE(insn) \
+ : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \
+ : inputs \
+ : KVM_ASM_SAFE_CLOBBERS); \
+ vector; \
})
static inline uint8_t rdmsr_safe(uint32_t msr, uint64_t *val)
{
+ uint64_t error_code;
uint8_t vector;
uint32_t a, d;
asm volatile(KVM_ASM_SAFE("rdmsr")
- : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector)
+ : "=a"(a), "=d"(d), KVM_ASM_SAFE_OUTPUTS(vector, error_code)
: "c"(msr)
: KVM_ASM_SAFE_CLOBBERS);
@@ -827,10 +1031,9 @@ static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val)
bool kvm_is_tdp_enabled(void);
-uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
- uint64_t vaddr);
-void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
- uint64_t vaddr, uint64_t pte);
+uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
+ int *level);
+uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
uint64_t a3);
@@ -882,4 +1085,27 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
#define XSTATE_XTILE_DATA_MASK (1ULL << XSTATE_XTILE_DATA_BIT)
#define XFEATURE_XTILE_MASK (XSTATE_XTILE_CFG_MASK | \
XSTATE_XTILE_DATA_MASK)
+
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
+#define PFERR_IMPLICIT_ACCESS_BIT 48
+
+#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
+#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
+#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
+#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
+#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
+
#endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h
index c8343ff84f7f..4803e1056055 100644
--- a/tools/testing/selftests/kvm/include/x86_64/svm.h
+++ b/tools/testing/selftests/kvm/include/x86_64/svm.h
@@ -58,6 +58,27 @@ enum {
INTERCEPT_RDPRU,
};
+struct hv_vmcb_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31)
+
+/* Synthetic VM-Exit */
+#define HV_SVM_EXITCODE_ENL 0xf0000000
+#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1)
struct __attribute__ ((__packed__)) vmcb_control_area {
u32 intercept_cr;
@@ -106,7 +127,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
*/
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
index 7aee6244ab6a..044f0f872ba9 100644
--- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h
+++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
@@ -32,6 +32,20 @@ struct svm_test_data {
uint64_t msr_gpa;
};
+static inline void vmmcall(void)
+{
+ /*
+ * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle
+ * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended
+ * use of this function is to exit to L1 from L2. Clobber all other
+ * GPRs as L1 doesn't correctly preserve them during vmexits.
+ */
+ __asm__ __volatile__("push %%rbp; vmmcall; pop %%rbp"
+ : : "a"(0xdeadbeef), "c"(0xbeefdead)
+ : "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
+}
+
#define stgi() \
__asm__ __volatile__( \
"stgi\n" \
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 71b290b6469d..5f0c0a29c556 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -437,11 +437,16 @@ static inline int vmresume(void)
static inline void vmcall(void)
{
- /* Currently, L1 destroys our GPRs during vmexits. */
- __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : :
- "rax", "rbx", "rcx", "rdx",
- "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
- "r13", "r14", "r15");
+ /*
+ * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle
+ * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended
+ * use of this function is to exit to L1 from L2. Clobber all other
+ * GPRs as L1 doesn't correctly preserve them during vmexits.
+ */
+ __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp"
+ : : "a"(0xdeadbeef), "c"(0xbeefdead)
+ : "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
}
static inline int vmread(uint64_t encoding, uint64_t *value)
@@ -517,14 +522,6 @@ struct vmx_pages {
uint64_t vmwrite_gpa;
void *vmwrite;
- void *vp_assist_hva;
- uint64_t vp_assist_gpa;
- void *vp_assist;
-
- void *enlightened_vmcs_hva;
- uint64_t enlightened_vmcs_gpa;
- void *enlightened_vmcs;
-
void *eptp_hva;
uint64_t eptp_gpa;
void *eptp;
@@ -572,7 +569,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t memslot);
void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
uint64_t addr, uint64_t size);
-bool kvm_vm_has_ept(struct kvm_vm *vm);
+bool kvm_cpu_has_ept(void);
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot);
void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index f42c6ac6d71d..b3b00be1ef82 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -289,7 +289,6 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
/* Export shared structure test_args to guest */
- ucall_init(vm, NULL);
sync_global_to_guest(vm, test_args);
ret = sem_init(&test_stage_updated, 0, 0);
@@ -417,7 +416,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
TEST_ASSERT(ret == 0, "Error in sem_destroy");
free(vcpu_threads);
- ucall_uninit(vm);
kvm_vm_free(vm);
}
@@ -461,8 +459,8 @@ int main(int argc, char *argv[])
p.test_mem_size = parse_size(optarg);
break;
case 'v':
- nr_vcpus = atoi(optarg);
- TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+ TEST_ASSERT(nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
case 's':
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index f79f2e37dc3b..316de70db91d 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -508,15 +508,6 @@ void aarch64_get_supported_page_sizes(uint32_t ipa,
close(kvm_fd);
}
-/*
- * arm64 doesn't have a true default mode, so start by computing the
- * available IPA space and page sizes early.
- */
-void __attribute__((constructor)) init_guest_modes(void)
-{
- guest_modes_append_default();
-}
-
void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1,
uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5,
uint64_t arg6, struct arm_smccc_res *res)
@@ -541,3 +532,12 @@ void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1,
[arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6)
: "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7");
}
+
+void kvm_selftest_arch_init(void)
+{
+ /*
+ * arm64 doesn't have a true default mode, so start by computing the
+ * available IPA space and page sizes early.
+ */
+ guest_modes_append_default();
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
index ed237b744690..562c16dfbb00 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
@@ -6,108 +6,36 @@
*/
#include "kvm_util.h"
+/*
+ * ucall_exit_mmio_addr holds per-VM values (global data is duplicated by each
+ * VM), it must not be accessed from host code.
+ */
static vm_vaddr_t *ucall_exit_mmio_addr;
-static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa)
-{
- if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1))
- return false;
-
- virt_pg_map(vm, gpa, gpa);
-
- ucall_exit_mmio_addr = (vm_vaddr_t *)gpa;
- sync_global_to_guest(vm, ucall_exit_mmio_addr);
-
- return true;
-}
-
-void ucall_init(struct kvm_vm *vm, void *arg)
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
{
- vm_paddr_t gpa, start, end, step, offset;
- unsigned int bits;
- bool ret;
+ virt_pg_map(vm, mmio_gpa, mmio_gpa);
- if (arg) {
- gpa = (vm_paddr_t)arg;
- ret = ucall_mmio_init(vm, gpa);
- TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa);
- return;
- }
+ vm->ucall_mmio_addr = mmio_gpa;
- /*
- * Find an address within the allowed physical and virtual address
- * spaces, that does _not_ have a KVM memory region associated with
- * it. Identity mapping an address like this allows the guest to
- * access it, but as KVM doesn't know what to do with it, it
- * will assume it's something userspace handles and exit with
- * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64.
- * Here we start with a guess that the addresses around 5/8th
- * of the allowed space are unmapped and then work both down and
- * up from there in 1/16th allowed space sized steps.
- *
- * Note, we need to use VA-bits - 1 when calculating the allowed
- * virtual address space for an identity mapping because the upper
- * half of the virtual address space is the two's complement of the
- * lower and won't match physical addresses.
- */
- bits = vm->va_bits - 1;
- bits = min(vm->pa_bits, bits);
- end = 1ul << bits;
- start = end * 5 / 8;
- step = end / 16;
- for (offset = 0; offset < end - start; offset += step) {
- if (ucall_mmio_init(vm, start - offset))
- return;
- if (ucall_mmio_init(vm, start + offset))
- return;
- }
- TEST_FAIL("Can't find a ucall mmio address");
+ write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gpa);
}
-void ucall_uninit(struct kvm_vm *vm)
+void ucall_arch_do_ucall(vm_vaddr_t uc)
{
- ucall_exit_mmio_addr = 0;
- sync_global_to_guest(vm, ucall_exit_mmio_addr);
+ WRITE_ONCE(*ucall_exit_mmio_addr, uc);
}
-void ucall(uint64_t cmd, int nargs, ...)
-{
- struct ucall uc = {};
- va_list va;
- int i;
-
- WRITE_ONCE(uc.cmd, cmd);
- nargs = min(nargs, UCALL_MAX_ARGS);
-
- va_start(va, nargs);
- for (i = 0; i < nargs; ++i)
- WRITE_ONCE(uc.args[i], va_arg(va, uint64_t));
- va_end(va);
-
- WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc);
-}
-
-uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
+void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- struct ucall ucall = {};
-
- if (uc)
- memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_MMIO &&
- run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) {
- vm_vaddr_t gva;
-
- TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8,
+ run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) {
+ TEST_ASSERT(run->mmio.is_write && run->mmio.len == sizeof(uint64_t),
"Unexpected ucall exit mmio address access");
- memcpy(&gva, run->mmio.data, sizeof(gva));
- memcpy(&ucall, addr_gva2hva(vcpu->vm, gva), sizeof(ucall));
-
- vcpu_run_complete_io(vcpu);
- if (uc)
- memcpy(uc, &ucall, sizeof(ucall));
+ return (void *)(*((uint64_t *)run->mmio.data));
}
- return ucall.cmd;
+ return NULL;
}
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
index 51f280c412ba..820ac2d08c98 100644
--- a/tools/testing/selftests/kvm/lib/elf.c
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -138,7 +138,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
offset = hdr.e_phoff + (n1 * hdr.e_phentsize);
offset_rv = lseek(fd, offset, SEEK_SET);
TEST_ASSERT(offset_rv == offset,
- "Failed to seek to begining of program header %u,\n"
+ "Failed to seek to beginning of program header %u,\n"
" filename: %s\n"
" rv: %jd errno: %i",
n1, filename, (intmax_t) offset_rv, errno);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index e3daa97ab0f4..e9607eb089be 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -11,6 +11,7 @@
#include "processor.h"
#include <assert.h>
+#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -328,6 +329,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
{
uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus,
nr_extra_pages);
+ struct userspace_mem_region *slot0;
struct kvm_vm *vm;
int i;
@@ -342,9 +344,17 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus,
kvm_vm_elf_load(vm, program_invocation_name);
-#ifdef __x86_64__
- vm_create_irqchip(vm);
-#endif
+ /*
+ * TODO: Add proper defines to protect the library's memslots, and then
+ * carve out memslot1 for the ucall MMIO address. KVM treats writes to
+ * read-only memslots as MMIO, and creating a read-only memslot for the
+ * MMIO region would prevent silently clobbering the MMIO region.
+ */
+ slot0 = memslot2region(vm, 0);
+ ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
+
+ kvm_arch_vm_post_create(vm);
+
return vm;
}
@@ -445,6 +455,59 @@ struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
return vm_vcpu_recreate(vm, 0);
}
+void kvm_pin_this_task_to_pcpu(uint32_t pcpu)
+{
+ cpu_set_t mask;
+ int r;
+
+ CPU_ZERO(&mask);
+ CPU_SET(pcpu, &mask);
+ r = sched_setaffinity(0, sizeof(mask), &mask);
+ TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.\n", pcpu);
+}
+
+static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
+{
+ uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
+
+ TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
+ "Not allowed to run on pCPU '%d', check cgroups?\n", pcpu);
+ return pcpu;
+}
+
+void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
+ int nr_vcpus)
+{
+ cpu_set_t allowed_mask;
+ char *cpu, *cpu_list;
+ char delim[2] = ",";
+ int i, r;
+
+ cpu_list = strdup(pcpus_string);
+ TEST_ASSERT(cpu_list, "strdup() allocation failed.\n");
+
+ r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
+ TEST_ASSERT(!r, "sched_getaffinity() failed");
+
+ cpu = strtok(cpu_list, delim);
+
+ /* 1. Get all pcpus for vcpus. */
+ for (i = 0; i < nr_vcpus; i++) {
+ TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'\n", i);
+ vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
+ cpu = strtok(NULL, delim);
+ }
+
+ /* 2. Check if the main worker needs to be pinned. */
+ if (cpu) {
+ kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask));
+ cpu = strtok(NULL, delim);
+ }
+
+ TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
+ free(cpu_list);
+}
+
/*
* Userspace Memory Region Find
*
@@ -1160,8 +1223,8 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
* TEST_ASSERT failure occurs for invalid input or no area of at least
* sz unallocated bytes >= vaddr_min is available.
*/
-static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
- vm_vaddr_t vaddr_min)
+vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
+ vm_vaddr_t vaddr_min)
{
uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
@@ -1248,8 +1311,7 @@ vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
virt_pg_map(vm, vaddr, paddr);
- sparsebit_set(vm->vpages_mapped,
- vaddr >> vm->page_shift);
+ sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
}
return vaddr_start;
@@ -1351,6 +1413,8 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
virt_pg_map(vm, vaddr, paddr);
vaddr += page_size;
paddr += page_size;
+
+ sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
}
}
@@ -2043,3 +2107,19 @@ void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data,
break;
}
}
+
+__weak void kvm_arch_vm_post_create(struct kvm_vm *vm)
+{
+}
+
+__weak void kvm_selftest_arch_init(void)
+{
+}
+
+void __attribute((constructor)) kvm_selftest_init(void)
+{
+ /* Tell stdout not to buffer its content. */
+ setbuf(stdout, NULL);
+
+ kvm_selftest_arch_init();
+}
diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/memstress.c
index ee3f499ccbd2..5f1d3173c238 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -2,13 +2,15 @@
/*
* Copyright (C) 2020, Google LLC.
*/
+#define _GNU_SOURCE
+
#include <inttypes.h>
#include "kvm_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "processor.h"
-struct perf_test_args perf_test_args;
+struct memstress_args memstress_args;
/*
* Guest virtual memory offset of the testing memory slot.
@@ -31,7 +33,7 @@ struct vcpu_thread {
static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
/* The function run by each vCPU thread, as provided by the test. */
-static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *);
+static void (*vcpu_thread_fn)(struct memstress_vcpu_args *);
/* Set to true once all vCPU threads are up and running. */
static bool all_vcpu_threads_running;
@@ -42,14 +44,19 @@ static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
* Continuously write to the first 8 bytes of each page in the
* specified region.
*/
-void perf_test_guest_code(uint32_t vcpu_idx)
+void memstress_guest_code(uint32_t vcpu_idx)
{
- struct perf_test_args *pta = &perf_test_args;
- struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_idx];
+ struct memstress_args *args = &memstress_args;
+ struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx];
+ struct guest_random_state rand_state;
uint64_t gva;
uint64_t pages;
+ uint64_t addr;
+ uint64_t page;
int i;
+ rand_state = new_guest_random_state(args->random_seed + vcpu_idx);
+
gva = vcpu_args->gva;
pages = vcpu_args->pages;
@@ -58,9 +65,14 @@ void perf_test_guest_code(uint32_t vcpu_idx)
while (true) {
for (i = 0; i < pages; i++) {
- uint64_t addr = gva + (i * pta->guest_page_size);
+ if (args->random_access)
+ page = guest_random_u32(&rand_state) % pages;
+ else
+ page = i;
- if (i % pta->wr_fract == 0)
+ addr = gva + (page * args->guest_page_size);
+
+ if (guest_random_u32(&rand_state) % 100 < args->write_percent)
*(uint64_t *)addr = 0x0123456789ABCDEF;
else
READ_ONCE(*(uint64_t *)addr);
@@ -70,17 +82,17 @@ void perf_test_guest_code(uint32_t vcpu_idx)
}
}
-void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
+void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
struct kvm_vcpu *vcpus[],
uint64_t vcpu_memory_bytes,
bool partition_vcpu_memory_access)
{
- struct perf_test_args *pta = &perf_test_args;
- struct perf_test_vcpu_args *vcpu_args;
+ struct memstress_args *args = &memstress_args;
+ struct memstress_vcpu_args *vcpu_args;
int i;
for (i = 0; i < nr_vcpus; i++) {
- vcpu_args = &pta->vcpu_args[i];
+ vcpu_args = &args->vcpu_args[i];
vcpu_args->vcpu = vcpus[i];
vcpu_args->vcpu_idx = i;
@@ -89,29 +101,29 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus,
vcpu_args->gva = guest_test_virt_mem +
(i * vcpu_memory_bytes);
vcpu_args->pages = vcpu_memory_bytes /
- pta->guest_page_size;
- vcpu_args->gpa = pta->gpa + (i * vcpu_memory_bytes);
+ args->guest_page_size;
+ vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes);
} else {
vcpu_args->gva = guest_test_virt_mem;
vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) /
- pta->guest_page_size;
- vcpu_args->gpa = pta->gpa;
+ args->guest_page_size;
+ vcpu_args->gpa = args->gpa;
}
vcpu_args_set(vcpus[i], 1, i);
pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
i, vcpu_args->gpa, vcpu_args->gpa +
- (vcpu_args->pages * pta->guest_page_size));
+ (vcpu_args->pages * args->guest_page_size));
}
}
-struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
+struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus,
uint64_t vcpu_memory_bytes, int slots,
enum vm_mem_backing_src_type backing_src,
bool partition_vcpu_memory_access)
{
- struct perf_test_args *pta = &perf_test_args;
+ struct memstress_args *args = &memstress_args;
struct kvm_vm *vm;
uint64_t guest_num_pages, slot0_pages = 0;
uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src);
@@ -121,20 +133,20 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
/* By default vCPUs will write to memory. */
- pta->wr_fract = 1;
+ args->write_percent = 100;
/*
* Snapshot the non-huge page size. This is used by the guest code to
* access/dirty pages at the logging granularity.
*/
- pta->guest_page_size = vm_guest_mode_params[mode].page_size;
+ args->guest_page_size = vm_guest_mode_params[mode].page_size;
guest_num_pages = vm_adjust_num_guest_pages(mode,
- (nr_vcpus * vcpu_memory_bytes) / pta->guest_page_size);
+ (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size);
TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0,
"Guest memory size is not host page size aligned.");
- TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0,
+ TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0,
"Guest memory size is not guest page size aligned.");
TEST_ASSERT(guest_num_pages % slots == 0,
"Guest memory cannot be evenly divided into %d slots.",
@@ -144,8 +156,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
* If using nested, allocate extra pages for the nested page tables and
* in-memory data structures.
*/
- if (pta->nested)
- slot0_pages += perf_test_nested_pages(nr_vcpus);
+ if (args->nested)
+ slot0_pages += memstress_nested_pages(nr_vcpus);
/*
* Pass guest_num_pages to populate the page tables for test memory.
@@ -153,9 +165,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
* effect as KVM allows aliasing HVAs in meslots.
*/
vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages,
- perf_test_guest_code, vcpus);
+ memstress_guest_code, vcpus);
- pta->vm = vm;
+ args->vm = vm;
/* Put the test region at the top guest physical memory. */
region_end_gfn = vm->max_gfn + 1;
@@ -165,8 +177,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
* When running vCPUs in L2, restrict the test region to 48 bits to
* avoid needing 5-level page tables to identity map L2.
*/
- if (pta->nested)
- region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size);
+ if (args->nested)
+ region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size);
#endif
/*
* If there should be more memory in the guest test region than there
@@ -178,63 +190,72 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus,
" nr_vcpus: %d wss: %" PRIx64 "]\n",
guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes);
- pta->gpa = (region_end_gfn - guest_num_pages - 1) * pta->guest_page_size;
- pta->gpa = align_down(pta->gpa, backing_src_pagesz);
+ args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size;
+ args->gpa = align_down(args->gpa, backing_src_pagesz);
#ifdef __s390x__
/* Align to 1M (segment size) */
- pta->gpa = align_down(pta->gpa, 1 << 20);
+ args->gpa = align_down(args->gpa, 1 << 20);
#endif
- pta->size = guest_num_pages * pta->guest_page_size;
+ args->size = guest_num_pages * args->guest_page_size;
pr_info("guest physical test memory: [0x%lx, 0x%lx)\n",
- pta->gpa, pta->gpa + pta->size);
+ args->gpa, args->gpa + args->size);
/* Add extra memory slots for testing */
for (i = 0; i < slots; i++) {
uint64_t region_pages = guest_num_pages / slots;
- vm_paddr_t region_start = pta->gpa + region_pages * pta->guest_page_size * i;
+ vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i;
vm_userspace_mem_region_add(vm, backing_src, region_start,
- PERF_TEST_MEM_SLOT_INDEX + i,
+ MEMSTRESS_MEM_SLOT_INDEX + i,
region_pages, 0);
}
/* Do mapping for the demand paging memory slot */
- virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages);
+ virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages);
- perf_test_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes,
+ memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes,
partition_vcpu_memory_access);
- if (pta->nested) {
+ if (args->nested) {
pr_info("Configuring vCPUs to run in L2 (nested).\n");
- perf_test_setup_nested(vm, nr_vcpus, vcpus);
+ memstress_setup_nested(vm, nr_vcpus, vcpus);
}
- ucall_init(vm, NULL);
-
/* Export the shared variables to the guest. */
- sync_global_to_guest(vm, perf_test_args);
+ sync_global_to_guest(vm, memstress_args);
return vm;
}
-void perf_test_destroy_vm(struct kvm_vm *vm)
+void memstress_destroy_vm(struct kvm_vm *vm)
{
- ucall_uninit(vm);
kvm_vm_free(vm);
}
-void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract)
+void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent)
+{
+ memstress_args.write_percent = write_percent;
+ sync_global_to_guest(vm, memstress_args.write_percent);
+}
+
+void memstress_set_random_seed(struct kvm_vm *vm, uint32_t random_seed)
+{
+ memstress_args.random_seed = random_seed;
+ sync_global_to_guest(vm, memstress_args.random_seed);
+}
+
+void memstress_set_random_access(struct kvm_vm *vm, bool random_access)
{
- perf_test_args.wr_fract = wr_fract;
- sync_global_to_guest(vm, perf_test_args);
+ memstress_args.random_access = random_access;
+ sync_global_to_guest(vm, memstress_args.random_access);
}
-uint64_t __weak perf_test_nested_pages(int nr_vcpus)
+uint64_t __weak memstress_nested_pages(int nr_vcpus)
{
return 0;
}
-void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus)
+void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus)
{
pr_info("%s() not support on this architecture, skipping.\n", __func__);
exit(KSFT_SKIP);
@@ -243,6 +264,10 @@ void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_v
static void *vcpu_thread_main(void *data)
{
struct vcpu_thread *vcpu = data;
+ int vcpu_idx = vcpu->vcpu_idx;
+
+ if (memstress_args.pin_vcpus)
+ kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]);
WRITE_ONCE(vcpu->running, true);
@@ -255,19 +280,19 @@ static void *vcpu_thread_main(void *data)
while (!READ_ONCE(all_vcpu_threads_running))
;
- vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_idx]);
+ vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]);
return NULL;
}
-void perf_test_start_vcpu_threads(int nr_vcpus,
- void (*vcpu_fn)(struct perf_test_vcpu_args *))
+void memstress_start_vcpu_threads(int nr_vcpus,
+ void (*vcpu_fn)(struct memstress_vcpu_args *))
{
int i;
vcpu_thread_fn = vcpu_fn;
WRITE_ONCE(all_vcpu_threads_running, false);
- WRITE_ONCE(perf_test_args.stop_vcpus, false);
+ WRITE_ONCE(memstress_args.stop_vcpus, false);
for (i = 0; i < nr_vcpus; i++) {
struct vcpu_thread *vcpu = &vcpu_threads[i];
@@ -286,11 +311,11 @@ void perf_test_start_vcpu_threads(int nr_vcpus,
WRITE_ONCE(all_vcpu_threads_running, true);
}
-void perf_test_join_vcpu_threads(int nr_vcpus)
+void memstress_join_vcpu_threads(int nr_vcpus)
{
int i;
- WRITE_ONCE(perf_test_args.stop_vcpus, true);
+ WRITE_ONCE(memstress_args.stop_vcpus, true);
for (i = 0; i < nr_vcpus; i++)
pthread_join(vcpu_threads[i].thread, NULL);
diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c
index 087b9740bc8f..9a3476a2dfca 100644
--- a/tools/testing/selftests/kvm/lib/riscv/ucall.c
+++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c
@@ -10,11 +10,7 @@
#include "kvm_util.h"
#include "processor.h"
-void ucall_init(struct kvm_vm *vm, void *arg)
-{
-}
-
-void ucall_uninit(struct kvm_vm *vm)
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
{
}
@@ -44,47 +40,22 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
return ret;
}
-void ucall(uint64_t cmd, int nargs, ...)
+void ucall_arch_do_ucall(vm_vaddr_t uc)
{
- struct ucall uc = {
- .cmd = cmd,
- };
- va_list va;
- int i;
-
- nargs = min(nargs, UCALL_MAX_ARGS);
-
- va_start(va, nargs);
- for (i = 0; i < nargs; ++i)
- uc.args[i] = va_arg(va, uint64_t);
- va_end(va);
-
sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT,
KVM_RISCV_SELFTESTS_SBI_UCALL,
- (vm_vaddr_t)&uc, 0, 0, 0, 0, 0);
+ uc, 0, 0, 0, 0, 0);
}
-uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
+void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- struct ucall ucall = {};
-
- if (uc)
- memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_RISCV_SBI &&
run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) {
switch (run->riscv_sbi.function_id) {
case KVM_RISCV_SELFTESTS_SBI_UCALL:
- memcpy(&ucall,
- addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]),
- sizeof(ucall));
-
- vcpu_run_complete_io(vcpu);
- if (uc)
- memcpy(uc, &ucall, sizeof(ucall));
-
- break;
+ return (void *)run->riscv_sbi.args[0];
case KVM_RISCV_SELFTESTS_SBI_UNEXP:
vcpu_dump(stderr, vcpu, 2);
TEST_ASSERT(0, "Unexpected trap taken by guest");
@@ -93,6 +64,5 @@ uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
break;
}
}
-
- return ucall.cmd;
+ return NULL;
}
diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c
index 73dc4e21190f..a7f02dc372cf 100644
--- a/tools/testing/selftests/kvm/lib/s390x/ucall.c
+++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c
@@ -6,40 +6,19 @@
*/
#include "kvm_util.h"
-void ucall_init(struct kvm_vm *vm, void *arg)
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
{
}
-void ucall_uninit(struct kvm_vm *vm)
+void ucall_arch_do_ucall(vm_vaddr_t uc)
{
-}
-
-void ucall(uint64_t cmd, int nargs, ...)
-{
- struct ucall uc = {
- .cmd = cmd,
- };
- va_list va;
- int i;
-
- nargs = min(nargs, UCALL_MAX_ARGS);
-
- va_start(va, nargs);
- for (i = 0; i < nargs; ++i)
- uc.args[i] = va_arg(va, uint64_t);
- va_end(va);
-
/* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */
- asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory");
+ asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory");
}
-uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
+void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- struct ucall ucall = {};
-
- if (uc)
- memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_S390_SIEIC &&
run->s390_sieic.icptcode == 4 &&
@@ -47,13 +26,7 @@ uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
(run->s390_sieic.ipb >> 16) == 0x501) {
int reg = run->s390_sieic.ipa & 0xf;
- memcpy(&ucall, addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]),
- sizeof(ucall));
-
- vcpu_run_complete_io(vcpu);
- if (uc)
- memcpy(uc, &ucall, sizeof(ucall));
+ return (void *)run->s.regs.gprs[reg];
}
-
- return ucall.cmd;
+ return NULL;
}
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 6d23878bbfe1..5c22fa4c2825 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -18,6 +18,23 @@
#include "test_util.h"
/*
+ * Random number generator that is usable from guest code. This is the
+ * Park-Miller LCG using standard constants.
+ */
+
+struct guest_random_state new_guest_random_state(uint32_t seed)
+{
+ struct guest_random_state s = {.seed = seed};
+ return s;
+}
+
+uint32_t guest_random_u32(struct guest_random_state *state)
+{
+ state->seed = (uint64_t)state->seed * 48271 % ((uint32_t)(1 << 31) - 1);
+ return state->seed;
+}
+
+/*
* Parses "[0-9]+[kmgt]?".
*/
size_t parse_size(const char *size)
@@ -334,3 +351,22 @@ long get_run_delay(void)
return val[1];
}
+
+int atoi_paranoid(const char *num_str)
+{
+ char *end_ptr;
+ long num;
+
+ errno = 0;
+ num = strtol(num_str, &end_ptr, 0);
+ TEST_ASSERT(!errno, "strtol(\"%s\") failed", num_str);
+ TEST_ASSERT(num_str != end_ptr,
+ "strtol(\"%s\") didn't find a valid integer.", num_str);
+ TEST_ASSERT(*end_ptr == '\0',
+ "strtol(\"%s\") failed to parse trailing characters \"%s\".",
+ num_str, end_ptr);
+ TEST_ASSERT(num >= INT_MIN && num <= INT_MAX,
+ "%ld not in range of [%d, %d]", num, INT_MIN, INT_MAX);
+
+ return num;
+}
diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c
new file mode 100644
index 000000000000..fcae96461e46
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/ucall_common.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "kvm_util.h"
+#include "linux/types.h"
+#include "linux/bitmap.h"
+#include "linux/atomic.h"
+
+struct ucall_header {
+ DECLARE_BITMAP(in_use, KVM_MAX_VCPUS);
+ struct ucall ucalls[KVM_MAX_VCPUS];
+};
+
+/*
+ * ucall_pool holds per-VM values (global data is duplicated by each VM), it
+ * must not be accessed from host code.
+ */
+static struct ucall_header *ucall_pool;
+
+void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
+{
+ struct ucall_header *hdr;
+ struct ucall *uc;
+ vm_vaddr_t vaddr;
+ int i;
+
+ vaddr = vm_vaddr_alloc(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR);
+ hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr);
+ memset(hdr, 0, sizeof(*hdr));
+
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ uc = &hdr->ucalls[i];
+ uc->hva = uc;
+ }
+
+ write_guest_global(vm, ucall_pool, (struct ucall_header *)vaddr);
+
+ ucall_arch_init(vm, mmio_gpa);
+}
+
+static struct ucall *ucall_alloc(void)
+{
+ struct ucall *uc;
+ int i;
+
+ GUEST_ASSERT(ucall_pool);
+
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ if (!atomic_test_and_set_bit(i, ucall_pool->in_use)) {
+ uc = &ucall_pool->ucalls[i];
+ memset(uc->args, 0, sizeof(uc->args));
+ return uc;
+ }
+ }
+
+ GUEST_ASSERT(0);
+ return NULL;
+}
+
+static void ucall_free(struct ucall *uc)
+{
+ /* Beware, here be pointer arithmetic. */
+ clear_bit(uc - ucall_pool->ucalls, ucall_pool->in_use);
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+ struct ucall *uc;
+ va_list va;
+ int i;
+
+ uc = ucall_alloc();
+
+ WRITE_ONCE(uc->cmd, cmd);
+
+ nargs = min(nargs, UCALL_MAX_ARGS);
+
+ va_start(va, nargs);
+ for (i = 0; i < nargs; ++i)
+ WRITE_ONCE(uc->args[i], va_arg(va, uint64_t));
+ va_end(va);
+
+ ucall_arch_do_ucall((vm_vaddr_t)uc->hva);
+
+ ucall_free(uc);
+}
+
+uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
+{
+ struct ucall ucall;
+ void *addr;
+
+ if (!uc)
+ uc = &ucall;
+
+ addr = ucall_arch_get_ucall(vcpu);
+ if (addr) {
+ memcpy(uc, addr, sizeof(*uc));
+ vcpu_run_complete_io(vcpu);
+ } else {
+ memset(uc, 0, sizeof(*uc));
+ }
+
+ return uc->cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
index 3b44846fc277..92cef20902f1 100644
--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@ -20,7 +20,7 @@
#include "kvm_util.h"
#include "test_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "userfaultfd_util.h"
#ifdef __NR_userfaultfd
diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c
new file mode 100644
index 000000000000..efb7e7a1354d
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Hyper-V specific functions.
+ *
+ * Copyright (C) 2021, Red Hat Inc.
+ */
+#include <stdint.h>
+#include "processor.h"
+#include "hyperv.h"
+
+struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm,
+ vm_vaddr_t *p_hv_pages_gva)
+{
+ vm_vaddr_t hv_pages_gva = vm_vaddr_alloc_page(vm);
+ struct hyperv_test_pages *hv = addr_gva2hva(vm, hv_pages_gva);
+
+ /* Setup of a region of guest memory for the VP Assist page. */
+ hv->vp_assist = (void *)vm_vaddr_alloc_page(vm);
+ hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist);
+ hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist);
+
+ /* Setup of a region of guest memory for the partition assist page. */
+ hv->partition_assist = (void *)vm_vaddr_alloc_page(vm);
+ hv->partition_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->partition_assist);
+ hv->partition_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->partition_assist);
+
+ /* Setup of a region of guest memory for the enlightened VMCS. */
+ hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm);
+ hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs);
+ hv->enlightened_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)hv->enlightened_vmcs);
+
+ *p_hv_pages_gva = hv_pages_gva;
+ return hv;
+}
+
+int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist)
+{
+ uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val);
+
+ current_vp_assist = vp_assist;
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c
index 0f344a7c89c4..d61e623afc8c 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/memstress.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * x86_64-specific extensions to perf_test_util.c.
+ * x86_64-specific extensions to memstress.c.
*
* Copyright (C) 2022, Google, Inc.
*/
@@ -11,25 +11,25 @@
#include "test_util.h"
#include "kvm_util.h"
-#include "perf_test_util.h"
+#include "memstress.h"
#include "processor.h"
#include "vmx.h"
-void perf_test_l2_guest_code(uint64_t vcpu_id)
+void memstress_l2_guest_code(uint64_t vcpu_id)
{
- perf_test_guest_code(vcpu_id);
+ memstress_guest_code(vcpu_id);
vmcall();
}
-extern char perf_test_l2_guest_entry[];
+extern char memstress_l2_guest_entry[];
__asm__(
-"perf_test_l2_guest_entry:"
+"memstress_l2_guest_entry:"
" mov (%rsp), %rdi;"
-" call perf_test_l2_guest_code;"
+" call memstress_l2_guest_code;"
" ud2;"
);
-static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
+static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
{
#define L2_GUEST_STACK_SIZE 64
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
@@ -42,14 +42,14 @@ static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
*rsp = vcpu_id;
- prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp);
+ prepare_vmcs(vmx, memstress_l2_guest_entry, rsp);
GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
GUEST_DONE();
}
-uint64_t perf_test_nested_pages(int nr_vcpus)
+uint64_t memstress_nested_pages(int nr_vcpus)
{
/*
* 513 page tables is enough to identity-map 256 TiB of L2 with 1G
@@ -59,7 +59,7 @@ uint64_t perf_test_nested_pages(int nr_vcpus)
return 513 + 10 * nr_vcpus;
}
-void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
+void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
{
uint64_t start, end;
@@ -72,12 +72,12 @@ void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
*/
nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
- start = align_down(perf_test_args.gpa, PG_SIZE_1G);
- end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G);
+ start = align_down(memstress_args.gpa, PG_SIZE_1G);
+ end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G);
nested_identity_map_1g(vmx, vm, start, end - start);
}
-void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
+void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
{
struct vmx_pages *vmx, *vmx0 = NULL;
struct kvm_regs regs;
@@ -85,12 +85,13 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
int vcpu_id;
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ TEST_REQUIRE(kvm_cpu_has_ept());
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vmx = vcpu_alloc_vmx(vm, &vmx_gva);
if (vcpu_id == 0) {
- perf_test_setup_ept(vmx, vm);
+ memstress_setup_ept(vmx, vm);
vmx0 = vmx;
} else {
/* Share the same EPT table across all vCPUs. */
@@ -100,11 +101,11 @@ void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
}
/*
- * Override the vCPU to run perf_test_l1_guest_code() which will
- * bounce it into L2 before calling perf_test_guest_code().
+ * Override the vCPU to run memstress_l1_guest_code() which will
+ * bounce it into L2 before calling memstress_guest_code().
*/
vcpu_regs_get(vcpus[vcpu_id], &regs);
- regs.rip = (unsigned long) perf_test_l1_guest_code;
+ regs.rip = (unsigned long) memstress_l1_guest_code;
vcpu_regs_set(vcpus[vcpu_id], &regs);
vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id);
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index b199dde90e9f..c5c46f5b767c 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -131,23 +131,28 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
}
}
-static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
- int level)
+static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
+ uint64_t vaddr, int level)
{
- uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift);
+ uint64_t pt_gpa = PTE_GET_PA(*parent_pte);
+ uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
+ TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,
+ "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
+ level + 1, vaddr);
+
return &page_table[index];
}
static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
- uint64_t pt_pfn,
+ uint64_t *parent_pte,
uint64_t vaddr,
uint64_t paddr,
int current_level,
int target_level)
{
- uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level);
+ uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);
if (!(*pte & PTE_PRESENT_MASK)) {
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
@@ -197,21 +202,20 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
* Allocate upper level page tables, if not already present. Return
* early if a hugepage was created.
*/
- pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
- vaddr, paddr, PG_LEVEL_512G, level);
+ pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level);
if (*pml4e & PTE_LARGE_MASK)
return;
- pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level);
+ pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level);
if (*pdpe & PTE_LARGE_MASK)
return;
- pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level);
+ pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level);
if (*pde & PTE_LARGE_MASK)
return;
/* Fill in page table entry. */
- pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K);
+ pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
"PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
@@ -241,30 +245,25 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
}
}
-static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm,
- struct kvm_vcpu *vcpu,
- uint64_t vaddr)
+static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
{
- uint16_t index[4];
- uint64_t *pml4e, *pdpe, *pde;
- uint64_t *pte;
- struct kvm_sregs sregs;
- uint64_t rsvd_mask = 0;
+ if (*pte & PTE_LARGE_MASK) {
+ TEST_ASSERT(*level == PG_LEVEL_NONE ||
+ *level == current_level,
+ "Unexpected hugepage at level %d\n", current_level);
+ *level = current_level;
+ }
- /* Set the high bits in the reserved mask. */
- if (vm->pa_bits < 52)
- rsvd_mask = GENMASK_ULL(51, vm->pa_bits);
+ return *level == current_level;
+}
- /*
- * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries
- * with 4-Level Paging and 5-Level Paging".
- * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1,
- * the XD flag (bit 63) is reserved.
- */
- vcpu_sregs_get(vcpu, &sregs);
- if ((sregs.efer & EFER_NX) == 0) {
- rsvd_mask |= PTE_NX_MASK;
- }
+uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
+ int *level)
+{
+ uint64_t *pml4e, *pdpe, *pde;
+
+ TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM,
+ "Invalid PG_LEVEL_* '%d'", *level);
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -279,54 +278,26 @@ static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm,
TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16),
"Canonical check failed. The virtual address is invalid.");
- index[0] = (vaddr >> 12) & 0x1ffu;
- index[1] = (vaddr >> 21) & 0x1ffu;
- index[2] = (vaddr >> 30) & 0x1ffu;
- index[3] = (vaddr >> 39) & 0x1ffu;
+ pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G);
+ if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G))
+ return pml4e;
- pml4e = addr_gpa2hva(vm, vm->pgd);
- TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
- "Expected pml4e to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
- "Unexpected reserved bits set.");
+ pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G);
+ if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G))
+ return pdpe;
- pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
- TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
- "Expected pdpe to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
- "Expected pdpe to map a pde not a 1-GByte page.");
- TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
- "Unexpected reserved bits set.");
+ pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M);
+ if (vm_is_target_pte(pde, level, PG_LEVEL_2M))
+ return pde;
- pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
- TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
- "Expected pde to be present for gva: 0x%08lx", vaddr);
- TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
- "Expected pde to map a pte not a 2-MByte page.");
- TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
- "Unexpected reserved bits set.");
-
- pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
- TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
- "Expected pte to be present for gva: 0x%08lx", vaddr);
-
- return &pte[index[0]];
+ return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
}
-uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
- uint64_t vaddr)
+uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
{
- uint64_t *pte = _vm_get_page_table_entry(vm, vcpu, vaddr);
+ int level = PG_LEVEL_4K;
- return *(uint64_t *)pte;
-}
-
-void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu,
- uint64_t vaddr, uint64_t pte)
-{
- uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpu, vaddr);
-
- *(uint64_t *)new_pte = pte;
+ return __vm_get_page_table_entry(vm, vaddr, &level);
}
void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
@@ -512,41 +483,17 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
{
- uint16_t index[4];
- uint64_t *pml4e, *pdpe, *pde;
- uint64_t *pte;
-
- TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
- "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
-
- index[0] = (gva >> 12) & 0x1ffu;
- index[1] = (gva >> 21) & 0x1ffu;
- index[2] = (gva >> 30) & 0x1ffu;
- index[3] = (gva >> 39) & 0x1ffu;
+ int level = PG_LEVEL_NONE;
+ uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);
- if (!vm->pgd_created)
- goto unmapped_gva;
- pml4e = addr_gpa2hva(vm, vm->pgd);
- if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
- goto unmapped_gva;
+ TEST_ASSERT(*pte & PTE_PRESENT_MASK,
+ "Leaf PTE not PRESENT for gva: 0x%08lx", gva);
- pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
- if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
- goto unmapped_gva;
-
- pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
- if (!(pde[index[1]] & PTE_PRESENT_MASK))
- goto unmapped_gva;
-
- pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
- if (!(pte[index[0]] & PTE_PRESENT_MASK))
- goto unmapped_gva;
-
- return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK);
-
-unmapped_gva:
- TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
- exit(EXIT_FAILURE);
+ /*
+ * No need for a hugepage mask on the PTE, x86-64 requires the "unused"
+ * address bits to be zero.
+ */
+ return PTE_GET_PA(*pte) | (gva & ~HUGEPAGE_MASK(level));
}
static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt)
@@ -639,6 +586,11 @@ void __vm_xsave_require_permission(int bit, const char *name)
bitmask);
}
+void kvm_arch_vm_post_create(struct kvm_vm *vm)
+{
+ vm_create_irqchip(vm);
+}
+
struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id,
void *guest_code)
{
@@ -701,8 +653,9 @@ const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
return cpuid;
}
-bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
- struct kvm_x86_cpu_feature feature)
+static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,
+ uint32_t function, uint32_t index,
+ uint8_t reg, uint8_t lo, uint8_t hi)
{
const struct kvm_cpuid_entry2 *entry;
int i;
@@ -715,12 +668,25 @@ bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
* order, but kvm_x86_cpu_feature matches that mess, so yay
* pointer shenanigans!
*/
- if (entry->function == feature.function &&
- entry->index == feature.index)
- return (&entry->eax)[feature.reg] & BIT(feature.bit);
+ if (entry->function == function && entry->index == index)
+ return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;
}
- return false;
+ return 0;
+}
+
+bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
+ struct kvm_x86_cpu_feature feature)
+{
+ return __kvm_cpu_has(cpuid, feature.function, feature.index,
+ feature.reg, feature.bit, feature.bit);
+}
+
+uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
+ struct kvm_x86_cpu_property property)
+{
+ return __kvm_cpu_has(cpuid, property.function, property.index,
+ property.reg, property.lo_bit, property.hi_bit);
}
uint64_t kvm_get_feature_msr(uint64_t msr_index)
@@ -1060,34 +1026,15 @@ bool is_amd_cpu(void)
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
{
- const struct kvm_cpuid_entry2 *entry;
- bool pae;
-
- /* SDM 4.1.4 */
- if (kvm_get_cpuid_max_extended() < 0x80000008) {
- pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
- *pa_bits = pae ? 36 : 32;
+ if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
+ *pa_bits == kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;
*va_bits = 32;
} else {
- entry = kvm_get_supported_cpuid_entry(0x80000008);
- *pa_bits = entry->eax & 0xff;
- *va_bits = (entry->eax >> 8) & 0xff;
+ *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
+ *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);
}
}
-struct idt_entry {
- uint16_t offset0;
- uint16_t selector;
- uint16_t ist : 3;
- uint16_t : 5;
- uint16_t type : 4;
- uint16_t : 1;
- uint16_t dpl : 2;
- uint16_t p : 1;
- uint16_t offset1;
- uint32_t offset2; uint32_t reserved;
-};
-
static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
int dpl, unsigned short selector)
{
@@ -1117,6 +1064,7 @@ static bool kvm_fixup_exception(struct ex_regs *regs)
regs->rip = regs->r11;
regs->r9 = regs->vector;
+ regs->r10 = regs->error_code;
return true;
}
@@ -1279,7 +1227,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
unsigned long ht_gfn, max_gfn, max_pfn;
- uint32_t eax, ebx, ecx, edx, max_ext_leaf;
+ uint8_t maxphyaddr;
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
@@ -1293,8 +1241,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
/* Before family 17h, the HyperTransport area is just below 1T. */
ht_gfn = (1 << 28) - num_ht_pages;
- cpuid(1, &eax, &ebx, &ecx, &edx);
- if (x86_family(eax) < 0x17)
+ if (this_cpu_family() < 0x17)
goto done;
/*
@@ -1302,17 +1249,14 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
* reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use
* the old conservative value if MAXPHYADDR is not enumerated.
*/
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- max_ext_leaf = eax;
- if (max_ext_leaf < 0x80000008)
+ if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
goto done;
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
- max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
- if (max_ext_leaf >= 0x8000001f) {
- cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
- max_pfn >>= (ebx >> 6) & 0x3f;
- }
+ maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
+ max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;
+
+ if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))
+ max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);
ht_gfn = max_pfn - num_ht_pages;
done:
diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
index e5f0f9e0d3ee..4d41dc63cc9e 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
@@ -8,52 +8,25 @@
#define UCALL_PIO_PORT ((uint16_t)0x1000)
-void ucall_init(struct kvm_vm *vm, void *arg)
+void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa)
{
}
-void ucall_uninit(struct kvm_vm *vm)
+void ucall_arch_do_ucall(vm_vaddr_t uc)
{
-}
-
-void ucall(uint64_t cmd, int nargs, ...)
-{
- struct ucall uc = {
- .cmd = cmd,
- };
- va_list va;
- int i;
-
- nargs = min(nargs, UCALL_MAX_ARGS);
-
- va_start(va, nargs);
- for (i = 0; i < nargs; ++i)
- uc.args[i] = va_arg(va, uint64_t);
- va_end(va);
-
asm volatile("in %[port], %%al"
- : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
+ : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory");
}
-uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc)
+void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- struct ucall ucall = {};
-
- if (uc)
- memset(uc, 0, sizeof(*uc));
if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) {
struct kvm_regs regs;
vcpu_regs_get(vcpu, &regs);
- memcpy(&ucall, addr_gva2hva(vcpu->vm, (vm_vaddr_t)regs.rdi),
- sizeof(ucall));
-
- vcpu_run_complete_io(vcpu);
- if (uc)
- memcpy(uc, &ucall, sizeof(ucall));
+ return (void *)regs.rdi;
}
-
- return ucall.cmd;
+ return NULL;
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index d21049c38fc5..59d97531c9b1 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -109,18 +109,6 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
memset(vmx->vmwrite_hva, 0, getpagesize());
- /* Setup of a region of guest memory for the VP Assist page. */
- vmx->vp_assist = (void *)vm_vaddr_alloc_page(vm);
- vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist);
- vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist);
-
- /* Setup of a region of guest memory for the enlightened VMCS. */
- vmx->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm);
- vmx->enlightened_vmcs_hva =
- addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs);
- vmx->enlightened_vmcs_gpa =
- addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs);
-
*p_vmx_gva = vmx_gva;
return vmx;
}
@@ -171,26 +159,18 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx)
bool load_vmcs(struct vmx_pages *vmx)
{
- if (!enable_evmcs) {
- /* Load a VMCS. */
- *(uint32_t *)(vmx->vmcs) = vmcs_revision();
- if (vmclear(vmx->vmcs_gpa))
- return false;
-
- if (vmptrld(vmx->vmcs_gpa))
- return false;
-
- /* Setup shadow VMCS, do not load it yet. */
- *(uint32_t *)(vmx->shadow_vmcs) =
- vmcs_revision() | 0x80000000ul;
- if (vmclear(vmx->shadow_vmcs_gpa))
- return false;
- } else {
- if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa,
- vmx->enlightened_vmcs))
- return false;
- current_evmcs->revision_id = EVMCS_VERSION;
- }
+ /* Load a VMCS. */
+ *(uint32_t *)(vmx->vmcs) = vmcs_revision();
+ if (vmclear(vmx->vmcs_gpa))
+ return false;
+
+ if (vmptrld(vmx->vmcs_gpa))
+ return false;
+
+ /* Setup shadow VMCS, do not load it yet. */
+ *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul;
+ if (vmclear(vmx->shadow_vmcs_gpa))
+ return false;
return true;
}
@@ -544,26 +524,22 @@ void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
__nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
}
-bool kvm_vm_has_ept(struct kvm_vm *vm)
+bool kvm_cpu_has_ept(void)
{
- struct kvm_vcpu *vcpu;
uint64_t ctrl;
- vcpu = list_first_entry(&vm->vcpus, struct kvm_vcpu, list);
- TEST_ASSERT(vcpu, "Cannot determine EPT support without vCPUs.\n");
-
- ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32;
+ ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32;
if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
return false;
- ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
+ ctrl = kvm_get_feature_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
return ctrl & SECONDARY_EXEC_ENABLE_EPT;
}
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot)
{
- TEST_REQUIRE(kvm_vm_has_ept(vm));
+ TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT");
vmx->eptp = (void *)vm_vaddr_alloc_page(vm);
vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c
index 9a6e4f3ad6b5..feaf2be20ff2 100644
--- a/tools/testing/selftests/kvm/max_guest_memory_test.c
+++ b/tools/testing/selftests/kvm/max_guest_memory_test.c
@@ -11,6 +11,7 @@
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/atomic.h>
+#include <linux/sizes.h>
#include "kvm_util.h"
#include "test_util.h"
@@ -162,8 +163,7 @@ int main(int argc, char *argv[])
* just below the 4gb boundary. This test could create memory at
* 1gb-3gb,but it's simpler to skip straight to 4gb.
*/
- const uint64_t size_1gb = (1 << 30);
- const uint64_t start_gpa = (4ull * size_1gb);
+ const uint64_t start_gpa = SZ_4G;
const int first_slot = 1;
struct timespec time_start, time_run1, time_reset, time_run2;
@@ -180,29 +180,26 @@ int main(int argc, char *argv[])
* are quite common for x86, requires changing only max_mem (KVM allows
* 32k memslots, 32k * 2gb == ~64tb of guest memory).
*/
- slot_size = 2 * size_1gb;
+ slot_size = SZ_2G;
max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
TEST_ASSERT(max_slots > first_slot, "KVM is broken");
/* All KVM MMUs should be able to survive a 128gb guest. */
- max_mem = 128 * size_1gb;
+ max_mem = 128ull * SZ_1G;
calc_default_nr_vcpus();
while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) {
switch (opt) {
case 'c':
- nr_vcpus = atoi(optarg);
- TEST_ASSERT(nr_vcpus > 0, "number of vcpus must be >0");
+ nr_vcpus = atoi_positive("Number of vCPUs", optarg);
break;
case 'm':
- max_mem = atoi(optarg) * size_1gb;
- TEST_ASSERT(max_mem > 0, "memory size must be >0");
+ max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G;
break;
case 's':
- slot_size = atoi(optarg) * size_1gb;
- TEST_ASSERT(slot_size > 0, "slot size must be >0");
+ slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G;
break;
case 'H':
hugepages = true;
@@ -245,7 +242,7 @@ int main(int argc, char *argv[])
#ifdef __x86_64__
/* Identity map memory in the guest using 1gb pages. */
- for (i = 0; i < slot_size; i += size_1gb)
+ for (i = 0; i < slot_size; i += SZ_1G)
__virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G);
#else
for (i = 0; i < slot_size; i += vm->page_size)
@@ -260,7 +257,7 @@ int main(int argc, char *argv[])
vcpus = NULL;
pr_info("Running with %lugb of guest memory and %u vCPUs\n",
- (gpa - start_gpa) / size_1gb, nr_vcpus);
+ (gpa - start_gpa) / SZ_1G, nr_vcpus);
rendezvous_with_vcpus(&time_start, "spawning");
rendezvous_with_vcpus(&time_run1, "run 1");
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 3a5e4518307c..9855c41ca811 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -21,7 +21,7 @@
#include <linux/bitops.h>
#include <linux/userfaultfd.h>
-#include "perf_test_util.h"
+#include "memstress.h"
#include "processor.h"
#include "test_util.h"
#include "guest_modes.h"
@@ -34,7 +34,7 @@
static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
-static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
{
struct kvm_vcpu *vcpu = vcpu_args->vcpu;
struct kvm_run *run;
@@ -43,7 +43,7 @@ static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args)
run = vcpu->run;
/* Let the guest access its memory until a stop signal is received */
- while (!READ_ONCE(perf_test_args.stop_vcpus)) {
+ while (!READ_ONCE(memstress_args.stop_vcpus)) {
ret = _vcpu_run(vcpu);
TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
@@ -70,10 +70,10 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
int i;
/*
- * Add the dummy memslot just below the perf_test_util memslot, which is
+ * Add the dummy memslot just below the memstress memslot, which is
* at the top of the guest physical address space.
*/
- gpa = perf_test_args.gpa - pages * vm->page_size;
+ gpa = memstress_args.gpa - pages * vm->page_size;
for (i = 0; i < nr_modifications; i++) {
usleep(delay);
@@ -85,8 +85,8 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
}
struct test_params {
- useconds_t memslot_modification_delay;
- uint64_t nr_memslot_modifications;
+ useconds_t delay;
+ uint64_t nr_iterations;
bool partition_vcpu_memory_access;
};
@@ -95,23 +95,22 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct test_params *p = arg;
struct kvm_vm *vm;
- vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
+ vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
VM_MEM_SRC_ANONYMOUS,
p->partition_vcpu_memory_access);
pr_info("Finished creating vCPUs\n");
- perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker);
+ memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);
pr_info("Started all vCPUs\n");
- add_remove_memslot(vm, p->memslot_modification_delay,
- p->nr_memslot_modifications);
+ add_remove_memslot(vm, p->delay, p->nr_iterations);
- perf_test_join_vcpu_threads(nr_vcpus);
+ memstress_join_vcpu_threads(nr_vcpus);
pr_info("All vCPU threads joined\n");
- perf_test_destroy_vm(vm);
+ memstress_destroy_vm(vm);
}
static void help(char *name)
@@ -140,9 +139,8 @@ int main(int argc, char *argv[])
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
int opt;
struct test_params p = {
- .memslot_modification_delay = 0,
- .nr_memslot_modifications =
- DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS,
+ .delay = 0,
+ .nr_iterations = DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS,
.partition_vcpu_memory_access = true
};
@@ -154,16 +152,14 @@ int main(int argc, char *argv[])
guest_modes_cmdline(optarg);
break;
case 'd':
- p.memslot_modification_delay = strtoul(optarg, NULL, 0);
- TEST_ASSERT(p.memslot_modification_delay >= 0,
- "A negative delay is not supported.");
+ p.delay = atoi_non_negative("Delay", optarg);
break;
case 'b':
guest_percpu_mem_size = parse_size(optarg);
break;
case 'v':
- nr_vcpus = atoi(optarg);
- TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+ nr_vcpus = atoi_positive("Number of vCPUs", optarg);
+ TEST_ASSERT(nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d",
max_vcpus);
break;
@@ -171,7 +167,7 @@ int main(int argc, char *argv[])
p.partition_vcpu_memory_access = false;
break;
case 'i':
- p.nr_memslot_modifications = atoi(optarg);
+ p.nr_iterations = atoi_positive("Number of iterations", optarg);
break;
case 'h':
default:
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 2ad40f7c9c08..e698306bf49d 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -289,7 +289,6 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
mempages = mem_size / guest_page_size;
data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
- ucall_init(data->vm, NULL);
TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size");
data->npages = mempages;
@@ -306,6 +305,8 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
TEST_ASSERT(data->hva_slots, "malloc() fail");
+ data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code);
+
pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
data->nslots, data->pages_per_slot, rempages);
@@ -966,40 +967,28 @@ static bool parse_args(int argc, char *argv[],
map_unmap_verify = true;
break;
case 's':
- targs->nslots = atoi(optarg);
+ targs->nslots = atoi_paranoid(optarg);
if (targs->nslots <= 1 && targs->nslots != -1) {
pr_info("Slot count cap must be larger than 1 or -1 for no cap\n");
return false;
}
break;
case 'f':
- targs->tfirst = atoi(optarg);
- if (targs->tfirst < 0) {
- pr_info("First test to run has to be non-negative\n");
- return false;
- }
+ targs->tfirst = atoi_non_negative("First test", optarg);
break;
case 'e':
- targs->tlast = atoi(optarg);
- if (targs->tlast < 0 || targs->tlast >= NTESTS) {
+ targs->tlast = atoi_non_negative("Last test", optarg);
+ if (targs->tlast >= NTESTS) {
pr_info("Last test to run has to be non-negative and less than %zu\n",
NTESTS);
return false;
}
break;
case 'l':
- targs->seconds = atoi(optarg);
- if (targs->seconds < 0) {
- pr_info("Test length in seconds has to be non-negative\n");
- return false;
- }
+ targs->seconds = atoi_non_negative("Test length", optarg);
break;
case 'r':
- targs->runs = atoi(optarg);
- if (targs->runs <= 0) {
- pr_info("Runs per test has to be positive\n");
- return false;
- }
+ targs->runs = atoi_positive("Runs per test", optarg);
break;
}
}
@@ -1103,9 +1092,6 @@ int main(int argc, char *argv[])
struct test_result rbestslottime;
int tctr;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
if (!check_memory_sizes())
return -1;
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index 6f88da7e60be..3045fdf9bdf5 100644
--- a/tools/testing/selftests/kvm/rseq_test.c
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -205,9 +205,6 @@ int main(int argc, char *argv[])
struct kvm_vcpu *vcpu;
u32 cpu, rseq_cpu;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
strerror(errno));
@@ -224,7 +221,6 @@ int main(int argc, char *argv[])
* CPU affinity.
*/
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- ucall_init(vm, NULL);
pthread_create(&migration_thread, NULL, migration_worker,
(void *)(unsigned long)syscall(SYS_gettid));
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index 9113696d5178..3fd81e58f40c 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -760,8 +760,6 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP));
- setbuf(stdout, NULL); /* Tell stdout not to buffer its content */
-
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(testlist));
diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c
index 19486084eb30..e41e2cb8ffa9 100644
--- a/tools/testing/selftests/kvm/s390x/resets.c
+++ b/tools/testing/selftests/kvm/s390x/resets.c
@@ -296,8 +296,6 @@ int main(int argc, char *argv[])
bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS);
int idx;
- setbuf(stdout, NULL); /* Tell stdout not to buffer its content */
-
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(testlist));
diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
index 3fdb6e2598eb..2ddde41c44ba 100644
--- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
@@ -231,9 +231,6 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS));
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
ksft_print_header();
ksft_set_plan(ARRAY_SIZE(testlist));
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index 0d55f508d595..2ef1d1b72ce4 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -392,9 +392,6 @@ int main(int argc, char *argv[])
int i, loops;
#endif
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
#ifdef __x86_64__
/*
* FIXME: the zero-memslot test fails on aarch64 and s390x because
@@ -407,7 +404,7 @@ int main(int argc, char *argv[])
#ifdef __x86_64__
if (argc > 1)
- loops = atoi(argv[1]);
+ loops = atoi_positive("Number of iterations", argv[1]);
else
loops = 10;
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
index db8967f1a17b..c87f38712073 100644
--- a/tools/testing/selftests/kvm/steal_time.c
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -266,7 +266,6 @@ int main(int ac, char **av)
gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0);
virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages);
- ucall_init(vm, NULL);
TEST_REQUIRE(is_steal_time_supported(vcpus[0]));
diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c
index 1c274933912b..7f5b330b6a1b 100644
--- a/tools/testing/selftests/kvm/system_counter_offset_test.c
+++ b/tools/testing/selftests/kvm/system_counter_offset_test.c
@@ -121,7 +121,6 @@ int main(void)
vm = vm_create_with_one_vcpu(&vcpu, guest_main);
check_preconditions(vcpu);
- ucall_init(vm, NULL);
enter_guest(vcpu);
kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c
index dadcbad10a1d..21de6ae42086 100644
--- a/tools/testing/selftests/kvm/x86_64/amx_test.c
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -39,11 +39,6 @@
#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
-#define TILE_CPUID 0x1d
-#define XSTATE_CPUID 0xd
-#define TILE_PALETTE_CPUID_SUBLEAVE 0x1
-#define XSTATE_USER_STATE_SUBLEAVE 0x0
-
#define XSAVE_HDR_OFFSET 512
struct xsave_data {
@@ -129,71 +124,26 @@ static bool check_xsave_supports_xtile(void)
return __xgetbv(0) & XFEATURE_MASK_XTILE;
}
-static bool enum_xtile_config(void)
-{
- u32 eax, ebx, ecx, edx;
-
- __cpuid(TILE_CPUID, TILE_PALETTE_CPUID_SUBLEAVE, &eax, &ebx, &ecx, &edx);
- if (!eax || !ebx || !ecx)
- return false;
-
- xtile.max_names = ebx >> 16;
- if (xtile.max_names < NUM_TILES)
- return false;
-
- xtile.bytes_per_tile = eax >> 16;
- if (xtile.bytes_per_tile < TILE_SIZE)
- return false;
-
- xtile.bytes_per_row = ebx;
- xtile.max_rows = ecx;
-
- return true;
-}
-
-static bool enum_xsave_tile(void)
-{
- u32 eax, ebx, ecx, edx;
-
- __cpuid(XSTATE_CPUID, XFEATURE_XTILEDATA, &eax, &ebx, &ecx, &edx);
- if (!eax || !ebx)
- return false;
-
- xtile.xsave_offset = ebx;
- xtile.xsave_size = eax;
-
- return true;
-}
-
-static bool check_xsave_size(void)
-{
- u32 eax, ebx, ecx, edx;
- bool valid = false;
-
- __cpuid(XSTATE_CPUID, XSTATE_USER_STATE_SUBLEAVE, &eax, &ebx, &ecx, &edx);
- if (ebx && ebx <= XSAVE_SIZE)
- valid = true;
-
- return valid;
-}
-
-static bool check_xtile_info(void)
+static void check_xtile_info(void)
{
- bool ret = false;
-
- if (!check_xsave_size())
- return ret;
-
- if (!enum_xsave_tile())
- return ret;
-
- if (!enum_xtile_config())
- return ret;
+ GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0));
+ GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE);
- if (sizeof(struct tile_data) >= xtile.xsave_size)
- ret = true;
+ xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET);
+ GUEST_ASSERT(xtile.xsave_offset == 2816);
+ xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE);
+ GUEST_ASSERT(xtile.xsave_size == 8192);
+ GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size);
- return ret;
+ GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS));
+ xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS);
+ GUEST_ASSERT(xtile.max_names == 8);
+ xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE);
+ GUEST_ASSERT(xtile.bytes_per_tile == 1024);
+ xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW);
+ GUEST_ASSERT(xtile.bytes_per_row == 64);
+ xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS);
+ GUEST_ASSERT(xtile.max_rows == 16);
}
static void set_tilecfg(struct tile_config *cfg)
@@ -238,16 +188,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
{
init_regs();
check_cpuid_xsave();
- GUEST_ASSERT(check_xsave_supports_xtile());
- GUEST_ASSERT(check_xtile_info());
-
- /* check xtile configs */
- GUEST_ASSERT(xtile.xsave_offset == 2816);
- GUEST_ASSERT(xtile.xsave_size == 8192);
- GUEST_ASSERT(xtile.max_names == 8);
- GUEST_ASSERT(xtile.bytes_per_tile == 1024);
- GUEST_ASSERT(xtile.bytes_per_row == 64);
- GUEST_ASSERT(xtile.max_rows == 16);
+ check_xsave_supports_xtile();
+ check_xtile_info();
GUEST_SYNC(1);
/* xfd=0, enable amx */
@@ -317,8 +259,9 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG));
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA));
- /* Get xsave/restore max size */
- xsave_restore_size = kvm_get_supported_cpuid_entry(0xd)->ecx;
+ TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE),
+ "KVM should enumerate max XSAVE size when XSAVE is supported");
+ xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE);
run = vcpu->run;
vcpu_regs_get(vcpu, &regs1);
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index a6aeee2e62e4..2fc3ad9c887e 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -43,15 +43,6 @@ static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
}
-static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid)
-{
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x40000000, &eax, &ebx, &ecx, &edx);
-
- GUEST_ASSERT(eax == 0x40000001);
-}
-
static void guest_main(struct kvm_cpuid2 *guest_cpuid)
{
GUEST_SYNC(1);
@@ -60,7 +51,7 @@ static void guest_main(struct kvm_cpuid2 *guest_cpuid)
GUEST_SYNC(2);
- test_cpuid_40000000(guest_cpuid);
+ GUEST_ASSERT(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF) == 0x40000001);
GUEST_DONE();
}
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index 4208487652f8..1027a671c7d3 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -57,9 +57,6 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
run = vcpu->run;
diff --git a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c b/tools/testing/selftests/kvm/x86_64/emulator_error_test.c
deleted file mode 100644
index 236e11755ba6..000000000000
--- a/tools/testing/selftests/kvm/x86_64/emulator_error_test.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2020, Google LLC.
- *
- * Tests for KVM_CAP_EXIT_ON_EMULATION_FAILURE capability.
- */
-
-#define _GNU_SOURCE /* for program_invocation_short_name */
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "vmx.h"
-
-#define MAXPHYADDR 36
-
-#define MEM_REGION_GVA 0x0000123456789000
-#define MEM_REGION_GPA 0x0000000700000000
-#define MEM_REGION_SLOT 10
-#define MEM_REGION_SIZE PAGE_SIZE
-
-static void guest_code(void)
-{
- __asm__ __volatile__("flds (%[addr])"
- :: [addr]"r"(MEM_REGION_GVA));
-
- GUEST_DONE();
-}
-
-/*
- * Accessors to get R/M, REG, and Mod bits described in the SDM vol 2,
- * figure 2-2 "Table Interpretation of ModR/M Byte (C8H)".
- */
-#define GET_RM(insn_byte) (insn_byte & 0x7)
-#define GET_REG(insn_byte) ((insn_byte & 0x38) >> 3)
-#define GET_MOD(insn_byte) ((insn_byte & 0xc) >> 6)
-
-/* Ensure we are dealing with a simple 2-byte flds instruction. */
-static bool is_flds(uint8_t *insn_bytes, uint8_t insn_size)
-{
- return insn_size >= 2 &&
- insn_bytes[0] == 0xd9 &&
- GET_REG(insn_bytes[1]) == 0x0 &&
- GET_MOD(insn_bytes[1]) == 0x0 &&
- /* Ensure there is no SIB byte. */
- GET_RM(insn_bytes[1]) != 0x4 &&
- /* Ensure there is no displacement byte. */
- GET_RM(insn_bytes[1]) != 0x5;
-}
-
-static void process_exit_on_emulation_error(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- struct kvm_regs regs;
- uint8_t *insn_bytes;
- uint8_t insn_size;
- uint64_t flags;
-
- TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
- "Unexpected exit reason: %u (%s)",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
-
- TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
- "Unexpected suberror: %u",
- run->emulation_failure.suberror);
-
- if (run->emulation_failure.ndata >= 1) {
- flags = run->emulation_failure.flags;
- if ((flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES) &&
- run->emulation_failure.ndata >= 3) {
- insn_size = run->emulation_failure.insn_size;
- insn_bytes = run->emulation_failure.insn_bytes;
-
- TEST_ASSERT(insn_size <= 15 && insn_size > 0,
- "Unexpected instruction size: %u",
- insn_size);
-
- TEST_ASSERT(is_flds(insn_bytes, insn_size),
- "Unexpected instruction. Expected 'flds' (0xd9 /0)");
-
- /*
- * If is_flds() succeeded then the instruction bytes
- * contained an flds instruction that is 2-bytes in
- * length (ie: no prefix, no SIB, no displacement).
- */
- vcpu_regs_get(vcpu, &regs);
- regs.rip += 2;
- vcpu_regs_set(vcpu, &regs);
- }
- }
-}
-
-static void do_guest_assert(struct ucall *uc)
-{
- REPORT_GUEST_ASSERT(*uc);
-}
-
-static void check_for_guest_assert(struct kvm_vcpu *vcpu)
-{
- struct ucall uc;
-
- if (vcpu->run->exit_reason == KVM_EXIT_IO &&
- get_ucall(vcpu, &uc) == UCALL_ABORT) {
- do_guest_assert(&uc);
- }
-}
-
-static void process_ucall_done(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- struct ucall uc;
-
- check_for_guest_assert(vcpu);
-
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s)",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
-
- TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE,
- "Unexpected ucall command: %lu, expected UCALL_DONE (%d)",
- uc.cmd, UCALL_DONE);
-}
-
-static uint64_t process_ucall(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- struct ucall uc;
-
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s)",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
-
- switch (get_ucall(vcpu, &uc)) {
- case UCALL_SYNC:
- break;
- case UCALL_ABORT:
- do_guest_assert(&uc);
- break;
- case UCALL_DONE:
- process_ucall_done(vcpu);
- break;
- default:
- TEST_ASSERT(false, "Unexpected ucall");
- }
-
- return uc.cmd;
-}
-
-int main(int argc, char *argv[])
-{
- struct kvm_vcpu *vcpu;
- struct kvm_vm *vm;
- uint64_t gpa, pte;
- uint64_t *hva;
- int rc;
-
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
- TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR));
-
- vm = vm_create_with_one_vcpu(&vcpu, guest_code);
-
- vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR);
-
- rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE);
- TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable");
- vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1);
-
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
- MEM_REGION_GPA, MEM_REGION_SLOT,
- MEM_REGION_SIZE / PAGE_SIZE, 0);
- gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE,
- MEM_REGION_GPA, MEM_REGION_SLOT);
- TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
- virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1);
- hva = addr_gpa2hva(vm, MEM_REGION_GPA);
- memset(hva, 0, PAGE_SIZE);
- pte = vm_get_page_table_entry(vm, vcpu, MEM_REGION_GVA);
- vm_set_page_table_entry(vm, vcpu, MEM_REGION_GVA, pte | (1ull << 36));
-
- vcpu_run(vcpu);
- process_exit_on_emulation_error(vcpu);
- vcpu_run(vcpu);
-
- TEST_ASSERT(process_ucall(vcpu) == UCALL_DONE, "Expected UCALL_DONE");
-
- kvm_vm_free(vm);
-
- return 0;
-}
diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
new file mode 100644
index 000000000000..37c61f712fd5
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022, Google LLC.
+ *
+ * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "flds_emulation.h"
+
+#include "test_util.h"
+
+#define MMIO_GPA 0x700000000
+#define MMIO_GVA MMIO_GPA
+
+static void guest_code(void)
+{
+ /* Execute flds with an MMIO address to force KVM to emulate it. */
+ flds(MMIO_GVA);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1);
+ virt_map(vm, MMIO_GVA, MMIO_GPA, 1);
+
+ vcpu_run(vcpu);
+ handle_flds_emulation_failure_exit(vcpu);
+ vcpu_run(vcpu);
+ ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h
new file mode 100644
index 000000000000..e43a7df25f2c
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/flds_emulation.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_FLDS_EMULATION_H
+#define SELFTEST_KVM_FLDS_EMULATION_H
+
+#include "kvm_util.h"
+
+#define FLDS_MEM_EAX ".byte 0xd9, 0x00"
+
+/*
+ * flds is an instruction that the KVM instruction emulator is known not to
+ * support. This can be used in guest code along with a mechanism to force
+ * KVM to emulate the instruction (e.g. by providing an MMIO address) to
+ * exercise emulation failures.
+ */
+static inline void flds(uint64_t address)
+{
+ __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address));
+}
+
+static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_regs regs;
+ uint8_t *insn_bytes;
+ uint64_t flags;
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit reason: %u (%s)",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
+ "Unexpected suberror: %u",
+ run->emulation_failure.suberror);
+
+ flags = run->emulation_failure.flags;
+ TEST_ASSERT(run->emulation_failure.ndata >= 3 &&
+ flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES,
+ "run->emulation_failure is missing instruction bytes");
+
+ TEST_ASSERT(run->emulation_failure.insn_size >= 2,
+ "Expected a 2-byte opcode for 'flds', got %d bytes",
+ run->emulation_failure.insn_size);
+
+ insn_bytes = run->emulation_failure.insn_bytes;
+ TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0,
+ "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x\n",
+ insn_bytes[0], insn_bytes[1]);
+
+ vcpu_regs_get(vcpu, &regs);
+ regs.rip += 2;
+ vcpu_regs_set(vcpu, &regs);
+}
+
+#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index e804eb08dff9..5c27efbf405e 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -134,9 +134,6 @@ int main(int argc, char *argv[])
const struct kvm_cpuid2 *hv_cpuid_entries;
struct kvm_vcpu *vcpu;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID));
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
index 99bc202243d2..ba09d300c953 100644
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
@@ -16,6 +16,7 @@
#include "kvm_util.h"
+#include "hyperv.h"
#include "vmx.h"
static int ud_count;
@@ -30,24 +31,19 @@ static void guest_nmi_handler(struct ex_regs *regs)
{
}
-/* Exits to L1 destroy GRPs! */
-static inline void rdmsr_fs_base(void)
+static inline void rdmsr_from_l2(uint32_t msr)
{
- __asm__ __volatile__ ("mov $0xc0000100, %%rcx; rdmsr" : : :
- "rax", "rbx", "rcx", "rdx",
- "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
- "r13", "r14", "r15");
-}
-static inline void rdmsr_gs_base(void)
-{
- __asm__ __volatile__ ("mov $0xc0000101, %%rcx; rdmsr" : : :
- "rax", "rbx", "rcx", "rdx",
- "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
- "r13", "r14", "r15");
+ /* Currently, L1 doesn't preserve GPRs during vmexits. */
+ __asm__ __volatile__ ("rdmsr" : : "c"(msr) :
+ "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
}
+/* Exit to L1 from L2 with RDMSR instruction */
void l2_guest_code(void)
{
+ u64 unused;
+
GUEST_SYNC(7);
GUEST_SYNC(8);
@@ -58,42 +54,58 @@ void l2_guest_code(void)
vmcall();
/* MSR-Bitmap tests */
- rdmsr_fs_base(); /* intercepted */
- rdmsr_fs_base(); /* intercepted */
- rdmsr_gs_base(); /* not intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */
vmcall();
- rdmsr_gs_base(); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* intercepted */
+
+ /* L2 TLB flush tests */
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS);
+ rdmsr_from_l2(MSR_FS_BASE);
+ /*
+ * Note: hypercall status (RAX) is not preserved correctly by L1 after
+ * synthetic vmexit, use unchecked version.
+ */
+ __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS,
+ &unused);
/* Done, exit to L1 and never come back. */
vmcall();
}
-void guest_code(struct vmx_pages *vmx_pages)
+void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages,
+ vm_vaddr_t hv_hcall_page_gpa)
{
#define L2_GUEST_STACK_SIZE 64
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa);
+
x2apic_enable();
GUEST_SYNC(1);
GUEST_SYNC(2);
- enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
+ enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist);
+ evmcs_enable();
- GUEST_ASSERT(vmx_pages->vmcs_gpa);
GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
GUEST_SYNC(3);
- GUEST_ASSERT(load_vmcs(vmx_pages));
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_ASSERT(load_evmcs(hv_pages));
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
GUEST_SYNC(4);
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
prepare_vmcs(vmx_pages, l2_guest_code,
&l2_guest_stack[L2_GUEST_STACK_SIZE]);
GUEST_SYNC(5);
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
current_evmcs->revision_id = -1u;
GUEST_ASSERT(vmlaunch());
current_evmcs->revision_id = EVMCS_VERSION;
@@ -102,8 +114,18 @@ void guest_code(struct vmx_pages *vmx_pages)
vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
PIN_BASED_NMI_EXITING);
+ /* L2 TLB flush setup */
+ current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa;
+ current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+ current_evmcs->hv_vm_id = 1;
+ current_evmcs->hv_vp_id = 1;
+ current_vp_assist->nested_control.features.directhypercall = 1;
+ *(u32 *)(hv_pages->partition_assist) = 0;
+
GUEST_ASSERT(!vmlaunch());
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
+ GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
/*
* NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is
@@ -146,12 +168,24 @@ void guest_code(struct vmx_pages *vmx_pages)
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
current_evmcs->guest_rip += 2; /* rdmsr */
+ /*
+ * L2 TLB flush test. First VMCALL should be handled directly by L0,
+ * no VMCALL exit expected.
+ */
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+ /* Enable synthetic vmexit */
+ *(u32 *)(hv_pages->partition_assist) = 1;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH);
+
GUEST_ASSERT(!vmresume());
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
GUEST_SYNC(11);
/* Try enlightened vmptrld with an incorrect GPA */
- evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
+ evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs);
GUEST_ASSERT(vmlaunch());
GUEST_ASSERT(ud_count == 1);
GUEST_DONE();
@@ -198,7 +232,8 @@ static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm,
int main(int argc, char *argv[])
{
- vm_vaddr_t vmx_pages_gva = 0;
+ vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0;
+ vm_vaddr_t hcall_page;
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
@@ -212,11 +247,16 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS));
+ hcall_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize());
+
vcpu_set_hv_cpuid(vcpu);
vcpu_enable_evmcs(vcpu);
vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vcpu, 1, vmx_pages_gva);
+ vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
+ vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vcpu);
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 05b32e550a80..3163c3e8db0a 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -13,25 +13,6 @@
#include "processor.h"
#include "hyperv.h"
-#define LINUX_OS_ID ((u64)0x8100 << 48)
-
-static inline uint8_t hypercall(u64 control, vm_vaddr_t input_address,
- vm_vaddr_t output_address, uint64_t *hv_status)
-{
- uint8_t vector;
-
- /* Note both the hypercall and the "asm safe" clobber r9-r11. */
- asm volatile("mov %[output_address], %%r8\n\t"
- KVM_ASM_SAFE("vmcall")
- : "=a" (*hv_status),
- "+c" (control), "+d" (input_address),
- KVM_ASM_SAFE_OUTPUTS(vector)
- : [output_address] "r"(output_address),
- "a" (-EFAULT)
- : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS);
- return vector;
-}
-
struct msr_data {
uint32_t idx;
bool available;
@@ -71,7 +52,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall)
GUEST_ASSERT(hcall->control);
- wrmsr(HV_X64_MSR_GUEST_OS_ID, LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) {
@@ -81,7 +62,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall)
input = output = 0;
}
- vector = hypercall(hcall->control, input, output, &res);
+ vector = __hyperv_hypercall(hcall->control, input, output, &res);
if (hcall->ud_expected) {
GUEST_ASSERT_2(vector == UD_VECTOR, hcall->control, vector);
} else {
@@ -169,7 +150,7 @@ static void guest_test_msrs_access(void)
*/
msr->idx = HV_X64_MSR_GUEST_OS_ID;
msr->write = 1;
- msr->write_val = LINUX_OS_ID;
+ msr->write_val = HYPERV_LINUX_OS_ID;
msr->available = 1;
break;
case 3:
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c
new file mode 100644
index 000000000000..8b791eac7d5a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <pthread.h>
+#include <inttypes.h>
+
+#include "kvm_util.h"
+#include "hyperv.h"
+#include "test_util.h"
+#include "vmx.h"
+
+#define RECEIVER_VCPU_ID_1 2
+#define RECEIVER_VCPU_ID_2 65
+
+#define IPI_VECTOR 0xfe
+
+static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1];
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[2];
+};
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARSE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
+ u32 vector;
+ u32 reserved;
+ struct hv_vpset vp_set;
+};
+
+static inline void hv_init(vm_vaddr_t pgs_gpa)
+{
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
+}
+
+static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa)
+{
+ u32 vcpu_id;
+
+ x2apic_enable();
+ hv_init(pgs_gpa);
+
+ vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
+
+ /* Signal sender vCPU we're ready */
+ ipis_rcvd[vcpu_id] = (u64)-1;
+
+ for (;;)
+ asm volatile("sti; hlt; cli");
+}
+
+static void guest_ipi_handler(struct ex_regs *regs)
+{
+ u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
+
+ ipis_rcvd[vcpu_id]++;
+ wrmsr(HV_X64_MSR_EOI, 1);
+}
+
+static inline void nop_loop(void)
+{
+ int i;
+
+ for (i = 0; i < 100000000; i++)
+ asm volatile("nop");
+}
+
+static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa)
+{
+ struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page;
+ struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page;
+ int stage = 1, ipis_expected[2] = {0};
+
+ hv_init(pgs_gpa);
+ GUEST_SYNC(stage++);
+
+ /* Wait for receiver vCPUs to come up */
+ while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2])
+ nop_loop();
+ ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0;
+
+ /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */
+ ipi->vector = IPI_VECTOR;
+ ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1;
+ hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */
+ hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT,
+ IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ ipi_ex->vp_set.valid_bank_mask = 1 << 0;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ ipi_ex->vp_set.valid_bank_mask = 1 << 1;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1);
+ ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /*
+ * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL.
+ * Nothing to write anything to XMM regs.
+ */
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT,
+ IPI_VECTOR, HV_GENERIC_SET_ALL);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ GUEST_DONE();
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg;
+ int old, r;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ vcpu->id, r);
+
+ vcpu_run(vcpu);
+
+ TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id);
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu[3];
+ unsigned int exit_reason;
+ vm_vaddr_t hcall_page;
+ pthread_t threads[2];
+ int stage = 1, r;
+ struct ucall uc;
+
+ vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code);
+
+ /* Hypercall input/output */
+ hcall_page = vm_vaddr_alloc_pages(vm, 2);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+ vm_init_descriptor_tables(vm);
+
+ vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code);
+ vcpu_init_descriptor_tables(vcpu[1]);
+ vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1);
+ vcpu_set_hv_cpuid(vcpu[1]);
+
+ vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code);
+ vcpu_init_descriptor_tables(vcpu[2]);
+ vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2);
+ vcpu_set_hv_cpuid(vcpu[2]);
+
+ vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler);
+
+ vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_hv_cpuid(vcpu[0]);
+
+ r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]);
+ TEST_ASSERT(!r, "pthread_create failed errno=%d", r);
+
+ r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]);
+ TEST_ASSERT(!r, "pthread_create failed errno=%d", errno);
+
+ while (true) {
+ vcpu_run(vcpu[0]);
+
+ exit_reason = vcpu[0]->run->exit_reason;
+ TEST_ASSERT(exit_reason == KVM_EXIT_IO,
+ "unexpected exit reason: %u (%s)",
+ exit_reason, exit_reason_str(exit_reason));
+
+ switch (get_ucall(vcpu[0], &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(uc.args[1] == stage,
+ "Unexpected stage: %ld (%d expected)\n",
+ uc.args[1], stage);
+ break;
+ case UCALL_DONE:
+ goto done;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ stage++;
+ }
+
+done:
+ cancel_join_vcpu_thread(threads[0], vcpu[1]);
+ cancel_join_vcpu_thread(threads[1], vcpu[2]);
+ kvm_vm_free(vm);
+
+ return r;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
index a380ad7bb9b3..3b3cc94ba8e4 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
@@ -23,59 +23,78 @@
#define L2_GUEST_STACK_SIZE 256
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
-
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS (1U << 31)
+/* Exit to L1 from L2 with RDMSR instruction */
+static inline void rdmsr_from_l2(uint32_t msr)
+{
+ /* Currently, L1 doesn't preserve GPRs during vmexits. */
+ __asm__ __volatile__ ("rdmsr" : : "c"(msr) :
+ "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
+}
void l2_guest_code(void)
{
+ u64 unused;
+
GUEST_SYNC(3);
/* Exit to L1 */
vmmcall();
/* MSR-Bitmap tests */
- rdmsr(MSR_FS_BASE); /* intercepted */
- rdmsr(MSR_FS_BASE); /* intercepted */
- rdmsr(MSR_GS_BASE); /* not intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */
vmmcall();
- rdmsr(MSR_GS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* intercepted */
GUEST_SYNC(5);
+ /* L2 TLB flush tests */
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ rdmsr_from_l2(MSR_FS_BASE);
+ /*
+ * Note: hypercall status (RAX) is not preserved correctly by L1 after
+ * synthetic vmexit, use unchecked version.
+ */
+ __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS, &unused);
+
/* Done, exit to L1 and never come back. */
vmmcall();
}
-static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm)
+static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm,
+ struct hyperv_test_pages *hv_pages,
+ vm_vaddr_t pgs_gpa)
{
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
struct vmcb *vmcb = svm->vmcb;
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
GUEST_SYNC(1);
- wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48);
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
+ enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist);
GUEST_ASSERT(svm->vmcb_gpa);
/* Prepare for L2 execution. */
generic_svm_setup(svm, l2_guest_code,
&l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ /* L2 TLB flush setup */
+ hve->partition_assist_page = hv_pages->partition_assist_gpa;
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ hve->hv_vm_id = 1;
+ hve->hv_vp_id = 1;
+ current_vp_assist->nested_control.features.directhypercall = 1;
+ *(u32 *)(hv_pages->partition_assist) = 0;
+
GUEST_SYNC(2);
run_guest(vmcb, svm->vmcb_gpa);
GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
@@ -98,18 +117,32 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm)
/* Intercept RDMSR 0xc0000101 without telling KVM about it */
set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800);
/* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */
- vmcb->control.clean |= VMCB_HV_NESTED_ENLIGHTENMENTS;
+ vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS;
run_guest(vmcb, svm->vmcb_gpa);
/* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */
GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
vmcb->save.rip += 3; /* vmcall */
/* Now tell KVM we've changed MSR-Bitmap */
- vmcb->control.clean &= ~VMCB_HV_NESTED_ENLIGHTENMENTS;
+ vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS;
run_guest(vmcb, svm->vmcb_gpa);
GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
vmcb->save.rip += 2; /* rdmsr */
+
+ /*
+ * L2 TLB flush test. First VMCALL should be handled directly by L0,
+ * no VMCALL exit expected.
+ */
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+ /* Enable synthetic vmexit */
+ *(u32 *)(hv_pages->partition_assist) = 1;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL);
+ GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH);
+
run_guest(vmcb, svm->vmcb_gpa);
GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
GUEST_SYNC(6);
@@ -119,8 +152,8 @@ static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm)
int main(int argc, char *argv[])
{
- vm_vaddr_t nested_gva = 0;
-
+ vm_vaddr_t nested_gva = 0, hv_pages_gva = 0;
+ vm_vaddr_t hcall_page;
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct kvm_run *run;
@@ -134,7 +167,13 @@ int main(int argc, char *argv[])
vcpu_set_hv_cpuid(vcpu);
run = vcpu->run;
vcpu_alloc_svm(vm, &nested_gva);
- vcpu_args_set(vcpu, 1, nested_gva);
+ vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
+
+ hcall_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize());
+
+ vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id);
for (stage = 1;; stage++) {
vcpu_run(vcpu);
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c
new file mode 100644
index 000000000000..68f97ff720a7
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <asm/barrier.h>
+#include <pthread.h>
+#include <inttypes.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+#include "test_util.h"
+#include "vmx.h"
+
+#define WORKER_VCPU_ID_1 2
+#define WORKER_VCPU_ID_2 65
+
+#define NTRY 100
+#define NTEST_PAGES 2
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[];
+};
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARSE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+#define HV_FLUSH_ALL_PROCESSORS BIT(0)
+#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
+#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
+#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
+
+/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
+struct hv_tlb_flush {
+ u64 address_space;
+ u64 flags;
+ u64 processor_mask;
+ u64 gva_list[];
+} __packed;
+
+/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
+struct hv_tlb_flush_ex {
+ u64 address_space;
+ u64 flags;
+ struct hv_vpset hv_vp_set;
+ u64 gva_list[];
+} __packed;
+
+/*
+ * Pass the following info to 'workers' and 'sender'
+ * - Hypercall page's GVA
+ * - Hypercall page's GPA
+ * - Test pages GVA
+ * - GVAs of the test pages' PTEs
+ */
+struct test_data {
+ vm_vaddr_t hcall_gva;
+ vm_paddr_t hcall_gpa;
+ vm_vaddr_t test_pages;
+ vm_vaddr_t test_pages_pte[NTEST_PAGES];
+};
+
+/* 'Worker' vCPU code checking the contents of the test page */
+static void worker_guest_code(vm_vaddr_t test_data)
+{
+ struct test_data *data = (struct test_data *)test_data;
+ u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
+ void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES;
+ u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64));
+ u64 expected, val;
+
+ x2apic_enable();
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+
+ for (;;) {
+ cpu_relax();
+
+ expected = READ_ONCE(*this_cpu);
+
+ /*
+ * Make sure the value in the test page is read after reading
+ * the expectation for the first time. Pairs with wmb() in
+ * prepare_to_test().
+ */
+ rmb();
+
+ val = READ_ONCE(*(u64 *)data->test_pages);
+
+ /*
+ * Make sure the value in the test page is read after before
+ * reading the expectation for the second time. Pairs with wmb()
+ * post_test().
+ */
+ rmb();
+
+ /*
+ * '0' indicates the sender is between iterations, wait until
+ * the sender is ready for this vCPU to start checking again.
+ */
+ if (!expected)
+ continue;
+
+ /*
+ * Re-read the per-vCPU byte to ensure the sender didn't move
+ * onto a new iteration.
+ */
+ if (expected != READ_ONCE(*this_cpu))
+ continue;
+
+ GUEST_ASSERT(val == expected);
+ }
+}
+
+/*
+ * Write per-CPU info indicating what each 'worker' CPU is supposed to see in
+ * test page. '0' means don't check.
+ */
+static void set_expected_val(void *addr, u64 val, int vcpu_id)
+{
+ void *exp_page = addr + PAGE_SIZE * NTEST_PAGES;
+
+ *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val;
+}
+
+/*
+ * Update PTEs swapping two test pages.
+ * TODO: use swap()/xchg() when these are provided.
+ */
+static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2)
+{
+ uint64_t tmp = *(uint64_t *)pte_gva1;
+
+ *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2;
+ *(uint64_t *)pte_gva2 = tmp;
+}
+
+/*
+ * TODO: replace the silly NOP loop with a proper udelay() implementation.
+ */
+static inline void do_delay(void)
+{
+ int i;
+
+ for (i = 0; i < 1000000; i++)
+ asm volatile("nop");
+}
+
+/*
+ * Prepare to test: 'disable' workers by setting the expectation to '0',
+ * clear hypercall input page and then swap two test pages.
+ */
+static inline void prepare_to_test(struct test_data *data)
+{
+ /* Clear hypercall input page */
+ memset((void *)data->hcall_gva, 0, PAGE_SIZE);
+
+ /* 'Disable' workers */
+ set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1);
+ set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2);
+
+ /* Make sure workers are 'disabled' before we swap PTEs. */
+ wmb();
+
+ /* Make sure workers have enough time to notice */
+ do_delay();
+
+ /* Swap test page mappings */
+ swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]);
+}
+
+/*
+ * Finalize the test: check hypercall resule set the expected val for
+ * 'worker' CPUs and give them some time to test.
+ */
+static inline void post_test(struct test_data *data, u64 exp1, u64 exp2)
+{
+ /* Make sure we change the expectation after swapping PTEs */
+ wmb();
+
+ /* Set the expectation for workers, '0' means don't test */
+ set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1);
+ set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2);
+
+ /* Make sure workers have enough time to test */
+ do_delay();
+}
+
+#define TESTVAL1 0x0101010101010101
+#define TESTVAL2 0x0202020202020202
+
+/* Main vCPU doing the test */
+static void sender_guest_code(vm_vaddr_t test_data)
+{
+ struct test_data *data = (struct test_data *)test_data;
+ struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva;
+ struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva;
+ vm_paddr_t hcall_gpa = data->hcall_gpa;
+ int i, stage = 1;
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa);
+
+ /* "Slow" hypercalls */
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa,
+ hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS;
+ flush->processor_mask = 0;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa,
+ hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS;
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [1] */
+ flush_ex->gva_list[1] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_1 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [2] */
+ flush_ex->gva_list[2] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ flush_ex->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ /* "Fast" hypercalls */
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [1] */
+ flush_ex->gva_list[1] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_1 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 :
+ TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [2] */
+ flush_ex->gva_list[2] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT,
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ flush_ex->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_DONE();
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg;
+ struct ucall uc;
+ int old;
+ int r;
+ unsigned int exit_reason;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ vcpu->id, r);
+
+ vcpu_run(vcpu);
+ exit_reason = vcpu->run->exit_reason;
+
+ TEST_ASSERT(exit_reason == KVM_EXIT_IO,
+ "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO",
+ vcpu->id, exit_reason, exit_reason_str(exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id);
+ }
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu[3];
+ unsigned int exit_reason;
+ pthread_t threads[2];
+ vm_vaddr_t test_data_page, gva;
+ vm_paddr_t gpa;
+ uint64_t *pte;
+ struct test_data *data;
+ struct ucall uc;
+ int stage = 1, r, i;
+
+ vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code);
+
+ /* Test data page */
+ test_data_page = vm_vaddr_alloc_page(vm);
+ data = (struct test_data *)addr_gva2hva(vm, test_data_page);
+
+ /* Hypercall input/output */
+ data->hcall_gva = vm_vaddr_alloc_pages(vm, 2);
+ data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva);
+ memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE);
+
+ /*
+ * Test pages: the first one is filled with '0x01's, the second with '0x02's
+ * and the test will swap their mappings. The third page keeps the indication
+ * about the current state of mappings.
+ */
+ data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1);
+ for (i = 0; i < NTEST_PAGES; i++)
+ memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i),
+ (u8)(i + 1), PAGE_SIZE);
+ set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1);
+ set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2);
+
+ /*
+ * Get PTE pointers for test pages and map them inside the guest.
+ * Use separate page for each PTE for simplicity.
+ */
+ gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR);
+ for (i = 0; i < NTEST_PAGES; i++) {
+ pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE);
+ gpa = addr_hva2gpa(vm, pte);
+ __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K);
+ data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK);
+ }
+
+ /*
+ * Sender vCPU which performs the test: swaps test pages, sets expectation
+ * for 'workers' and issues TLB flush hypercalls.
+ */
+ vcpu_args_set(vcpu[0], 1, test_data_page);
+ vcpu_set_hv_cpuid(vcpu[0]);
+
+ /* Create worker vCPUs which check the contents of the test pages */
+ vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code);
+ vcpu_args_set(vcpu[1], 1, test_data_page);
+ vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1);
+ vcpu_set_hv_cpuid(vcpu[1]);
+
+ vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code);
+ vcpu_args_set(vcpu[2], 1, test_data_page);
+ vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2);
+ vcpu_set_hv_cpuid(vcpu[2]);
+
+ r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]);
+ TEST_ASSERT(!r, "pthread_create() failed");
+
+ r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]);
+ TEST_ASSERT(!r, "pthread_create() failed");
+
+ while (true) {
+ vcpu_run(vcpu[0]);
+ exit_reason = vcpu[0]->run->exit_reason;
+
+ TEST_ASSERT(exit_reason == KVM_EXIT_IO,
+ "unexpected exit reason: %u (%s)",
+ exit_reason, exit_reason_str(exit_reason));
+
+ switch (get_ucall(vcpu[0], &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(uc.args[1] == stage,
+ "Unexpected stage: %ld (%d expected)\n",
+ uc.args[1], stage);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ stage++;
+ }
+
+done:
+ cancel_join_vcpu_thread(threads[0], vcpu[1]);
+ cancel_join_vcpu_thread(threads[1], vcpu[2]);
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
index 59ffe7fd354f..ea0978f22db8 100644
--- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
+++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
@@ -241,10 +241,10 @@ int main(int argc, char **argv)
while ((opt = getopt(argc, argv, "hp:t:r")) != -1) {
switch (opt) {
case 'p':
- reclaim_period_ms = atoi(optarg);
+ reclaim_period_ms = atoi_non_negative("Reclaim period", optarg);
break;
case 't':
- token = atoi(optarg);
+ token = atoi_paranoid(optarg);
break;
case 'r':
reboot_permissions = true;
@@ -257,7 +257,6 @@ int main(int argc, char **argv)
}
TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES));
- TEST_REQUIRE(reclaim_period_ms > 0);
__TEST_REQUIRE(token == MAGIC_TOKEN,
"This test must be run with the magic token %d.\n"
diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
index 76417c7d687b..310a104d94f0 100644
--- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c
+++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
@@ -72,9 +72,6 @@ int main(int argc, char *argv[])
struct kvm_vm *vm;
uint64_t msr_platform_info;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO));
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index ea4e259a1e2e..2de98fce7edd 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -21,29 +21,6 @@
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
-union cpuid10_ebx {
- struct {
- unsigned int no_unhalted_core_cycles:1;
- unsigned int no_instructions_retired:1;
- unsigned int no_unhalted_reference_cycles:1;
- unsigned int no_llc_reference:1;
- unsigned int no_llc_misses:1;
- unsigned int no_branch_instruction_retired:1;
- unsigned int no_branch_misses_retired:1;
- } split;
- unsigned int full;
-};
-
/* End of stuff taken from perf_event.h. */
/* Oddly, this isn't in perf_event.h. */
@@ -380,46 +357,31 @@ static void test_pmu_config_disable(void (*guest_code)(void))
}
/*
- * Check for a non-zero PMU version, at least one general-purpose
- * counter per logical processor, an EBX bit vector of length greater
- * than 5, and EBX[5] clear.
- */
-static bool check_intel_pmu_leaf(const struct kvm_cpuid_entry2 *entry)
-{
- union cpuid10_eax eax = { .full = entry->eax };
- union cpuid10_ebx ebx = { .full = entry->ebx };
-
- return eax.split.version_id && eax.split.num_counters > 0 &&
- eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
- !ebx.split.no_branch_instruction_retired;
-}
-
-/*
- * Note that CPUID leaf 0xa is Intel-specific. This leaf should be
- * clear on AMD hardware.
+ * On Intel, check for a non-zero PMU version, at least one general-purpose
+ * counter per logical processor, and support for counting the number of branch
+ * instructions retired.
*/
static bool use_intel_pmu(void)
{
- const struct kvm_cpuid_entry2 *entry;
-
- entry = kvm_get_supported_cpuid_entry(0xa);
- return is_intel_cpu() && check_intel_pmu_leaf(entry);
+ return is_intel_cpu() &&
+ kvm_cpu_property(X86_PROPERTY_PMU_VERSION) &&
+ kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) &&
+ kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
}
-static bool is_zen1(uint32_t eax)
+static bool is_zen1(uint32_t family, uint32_t model)
{
- return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f;
+ return family == 0x17 && model <= 0x0f;
}
-static bool is_zen2(uint32_t eax)
+static bool is_zen2(uint32_t family, uint32_t model)
{
- return x86_family(eax) == 0x17 &&
- x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f;
+ return family == 0x17 && model >= 0x30 && model <= 0x3f;
}
-static bool is_zen3(uint32_t eax)
+static bool is_zen3(uint32_t family, uint32_t model)
{
- return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f;
+ return family == 0x19 && model <= 0x0f;
}
/*
@@ -432,13 +394,13 @@ static bool is_zen3(uint32_t eax)
*/
static bool use_amd_pmu(void)
{
- const struct kvm_cpuid_entry2 *entry;
+ uint32_t family = kvm_cpu_family();
+ uint32_t model = kvm_cpu_model();
- entry = kvm_get_supported_cpuid_entry(1);
return is_amd_cpu() &&
- (is_zen1(entry->eax) ||
- is_zen2(entry->eax) ||
- is_zen3(entry->eax));
+ (is_zen1(family, model) ||
+ is_zen2(family, model) ||
+ is_zen3(family, model));
}
int main(int argc, char *argv[])
@@ -447,9 +409,6 @@ int main(int argc, char *argv[])
struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
TEST_REQUIRE(use_intel_pmu() || use_amd_pmu());
diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
index 2bb08bf2125d..a284fcef6ed7 100644
--- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
@@ -82,9 +82,6 @@ int main(int argc, char *argv[])
uint64_t cr4;
int rc;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
/*
* Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and
* use it to verify all supported CR4 bits can be set prior to defining
diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
new file mode 100644
index 000000000000..06edf00a97d6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Test that KVM emulates instructions in response to EPT violations when
+ * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "flds_emulation.h"
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define MAXPHYADDR 36
+
+#define MEM_REGION_GVA 0x0000123456789000
+#define MEM_REGION_GPA 0x0000000700000000
+#define MEM_REGION_SLOT 10
+#define MEM_REGION_SIZE PAGE_SIZE
+
+static void guest_code(bool tdp_enabled)
+{
+ uint64_t error_code;
+ uint64_t vector;
+
+ vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA));
+
+ /*
+ * When TDP is enabled, flds will trigger an emulation failure, exit to
+ * userspace, and then the selftest host "VMM" skips the instruction.
+ *
+ * When TDP is disabled, no instruction emulation is required so flds
+ * should generate #PF(RSVD).
+ */
+ if (tdp_enabled) {
+ GUEST_ASSERT(!vector);
+ } else {
+ GUEST_ASSERT_EQ(vector, PF_VECTOR);
+ GUEST_ASSERT(error_code & PFERR_RSVD_MASK);
+ }
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ uint64_t *pte;
+ uint64_t *hva;
+ uint64_t gpa;
+ int rc;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled());
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vcpu_set_cpuid_maxphyaddr(vcpu, MAXPHYADDR);
+
+ rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE);
+ TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable");
+ vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / PAGE_SIZE, 0);
+ gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE,
+ MEM_REGION_GPA, MEM_REGION_SLOT);
+ TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+ virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1);
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+ memset(hva, 0, PAGE_SIZE);
+
+ pte = vm_get_page_table_entry(vm, MEM_REGION_GVA);
+ *pte |= BIT_ULL(MAXPHYADDR);
+
+ vcpu_run(vcpu);
+
+ /*
+ * When TDP is enabled, KVM must emulate in response the guest physical
+ * address that is illegal from the guest's perspective, but is legal
+ * from hardware's perspeective. This should result in an emulation
+ * failure exit to userspace since KVM doesn't support emulating flds.
+ */
+ if (kvm_is_tdp_enabled()) {
+ handle_flds_emulation_failure_exit(vcpu);
+ vcpu_run(vcpu);
+ }
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unrecognized ucall: %lu\n", uc.cmd);
+ }
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
index 1f136a81858e..cb38a478e1f6 100644
--- a/tools/testing/selftests/kvm/x86_64/smm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -137,6 +137,8 @@ int main(int argc, char *argv[])
struct kvm_x86_state *state;
int stage, stage_reported;
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM));
+
/* Create VM */
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c
new file mode 100644
index 000000000000..e73fcdef47bb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_nested_shutdown_test
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ * Nested SVM testing: test that unintercepted shutdown in L2 doesn't crash the host
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ __asm__ __volatile__("ud2");
+}
+
+static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
+
+ idt[6].p = 0; // #UD is intercepted but its injection will cause #NP
+ idt[11].p = 0; // #NP is not intercepted and will cause another
+ // #NP that will be converted to #DF
+ idt[8].p = 0; // #DF will cause #NP which will cause SHUTDOWN
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ /* should not reach here */
+ GUEST_ASSERT(0);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ vm_vaddr_t svm_gva;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vcpu_alloc_svm(vm, &svm_gva);
+
+ vcpu_args_set(vcpu, 2, svm_gva, vm->idt);
+ run = vcpu->run;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
index e637d7736012..e497ace629c1 100644
--- a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
+++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
@@ -194,9 +194,6 @@ done:
int main(int argc, char *argv[])
{
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS),
diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
index 9b6db0b0b13e..d2f9b5bdfab2 100644
--- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
@@ -90,9 +90,6 @@ int main(int argc, char *argv[])
struct kvm_vcpu_events events;
int rv, cap;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c
index 70b44f0b52fe..ead5d878a71c 100644
--- a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c
+++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c
@@ -3,6 +3,7 @@
#include "kvm_util.h"
#include "processor.h"
#include "vmx.h"
+#include "svm_util.h"
#include <string.h>
#include <sys/ioctl.h>
@@ -20,10 +21,11 @@ static void l2_guest_code(void)
: : [port] "d" (ARBITRARY_IO_PORT) : "rax");
}
-void l1_guest_code(struct vmx_pages *vmx)
-{
#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+void l1_guest_code_vmx(struct vmx_pages *vmx)
+{
GUEST_ASSERT(vmx->vmcs_gpa);
GUEST_ASSERT(prepare_for_vmx_operation(vmx));
@@ -38,24 +40,53 @@ void l1_guest_code(struct vmx_pages *vmx)
GUEST_DONE();
}
+void l1_guest_code_svm(struct svm_test_data *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /* don't intercept shutdown to test the case of SVM allowing to do so */
+ vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ /* should not reach here, L1 should crash */
+ GUEST_ASSERT(0);
+}
+
int main(void)
{
struct kvm_vcpu *vcpu;
struct kvm_run *run;
struct kvm_vcpu_events events;
- vm_vaddr_t vmx_pages_gva;
struct ucall uc;
- TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ bool has_vmx = kvm_cpu_has(X86_FEATURE_VMX);
+ bool has_svm = kvm_cpu_has(X86_FEATURE_SVM);
+
+ TEST_REQUIRE(has_vmx || has_svm);
TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT));
- vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
- vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1);
+ if (has_vmx) {
+ vm_vaddr_t vmx_pages_gva;
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_vmx);
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
+ } else {
+ vm_vaddr_t svm_gva;
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_svm);
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vcpu, 1, svm_gva);
+ }
+
+ vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1);
run = vcpu->run;
- vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vcpu, 1, vmx_pages_gva);
vcpu_run(vcpu);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
@@ -78,13 +109,21 @@ int main(void)
"No triple fault pending");
vcpu_run(vcpu);
- switch (get_ucall(vcpu, &uc)) {
- case UCALL_DONE:
- break;
- case UCALL_ABORT:
- REPORT_GUEST_ASSERT(uc);
- default:
- TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
- }
+ if (has_svm) {
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Got exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ } else {
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+ }
+ return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
index 7316521428f8..91076c9787b4 100644
--- a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
+++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
@@ -56,9 +56,6 @@ int main(int argc, char *argv[])
struct kvm_vm *vm;
struct ucall uc;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
run = vcpu->run;
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
index a4f06370a245..25fa55344a10 100644
--- a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
+++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
@@ -733,16 +733,98 @@ static void test_msr_permission_bitmap(void)
kvm_vm_free(vm);
}
-int main(int argc, char *argv[])
+#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \
+({ \
+ int r = __vm_ioctl(vm, cmd, arg); \
+ \
+ if (flag & valid_mask) \
+ TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \
+ else \
+ TEST_ASSERT(r == -1 && errno == EINVAL, \
+ "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \
+ #cmd, flag, r, errno, strerror(errno)); \
+})
+
+static void run_user_space_msr_flag_test(struct kvm_vm *vm)
{
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
+ struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR };
+ int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE;
+ int rc;
+ int i;
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+
+ for (i = 0; i < nflags; i++) {
+ cap.args[0] = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap,
+ BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK);
+ }
+}
+
+static void run_msr_filter_flag_test(struct kvm_vm *vm)
+{
+ u64 deny_bits = 0;
+ struct kvm_msr_filter filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .nmsrs = 1,
+ .base = 0,
+ .bitmap = (uint8_t *)&deny_bits,
+ },
+ },
+ };
+ int nflags;
+ int rc;
+ int i;
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ nflags = sizeof(filter.flags) * BITS_PER_BYTE;
+ for (i = 0; i < nflags; i++) {
+ filter.flags = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter,
+ BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK);
+ }
+ filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
+ nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE;
+ for (i = 0; i < nflags; i++) {
+ filter.ranges[0].flags = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter,
+ BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK);
+ }
+}
+
+/* Test that attempts to write to the unused bits in a flag fails. */
+static void test_user_exit_msr_flags(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+ /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */
+ run_user_space_msr_flag_test(vm);
+
+ /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. */
+ run_msr_filter_flag_test(vm);
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
test_msr_filter_allow();
test_msr_filter_deny();
test_msr_permission_bitmap();
+ test_user_exit_msr_flags();
+
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
index 2d8c23d639f7..f0456fb031b1 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
@@ -78,6 +78,7 @@ int main(int argc, char *argv[])
bool done = false;
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ TEST_REQUIRE(kvm_cpu_has_ept());
/* Create VM */
vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
index 069589c52f41..c280ba1e6572 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
@@ -20,16 +20,6 @@
#define PMU_CAP_FW_WRITES (1ULL << 13)
#define PMU_CAP_LBR_FMT 0x3f
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
union perf_capabilities {
struct {
u64 lbr_format:6;
@@ -53,11 +43,9 @@ static void guest_code(void)
int main(int argc, char *argv[])
{
- const struct kvm_cpuid_entry2 *entry_a_0;
struct kvm_vm *vm;
struct kvm_vcpu *vcpu;
int ret;
- union cpuid10_eax eax;
union perf_capabilities host_cap;
uint64_t val;
@@ -69,11 +57,8 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
- TEST_REQUIRE(kvm_get_cpuid_max_basic() >= 0xa);
- entry_a_0 = kvm_get_supported_cpuid_entry(0xa);
-
- eax.full = entry_a_0->eax;
- __TEST_REQUIRE(eax.split.version_id, "PMU is not supported by the vCPU");
+ TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+ TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
/* testcase 1, set capabilities when we have PDCM bit */
vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
index 6f7a5ef66718..d7d37dae3eeb 100644
--- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
@@ -114,7 +114,9 @@ static void test_icr(struct xapic_vcpu *x)
* vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff.
*/
icr = APIC_INT_ASSERT | 0xff;
- for (i = vcpu->id + 1; i < 0xff; i++) {
+ for (i = 0; i < 0xff; i++) {
+ if (i == vcpu->id)
+ continue;
for (j = 0; j < 8; j++)
__test_icr(x, i << (32 + 24) | icr | (j << 8));
}
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 58e4f88b2b9f..1e567d1f6d3d 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -17,7 +17,6 @@
#include <linux/srcu.h>
#include <linux/export.h>
#include <trace/events/kvm.h>
-#include "irq.h"
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
@@ -50,7 +49,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
{
struct kvm_kernel_irq_routing_entry route;
- if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID))
+ if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID))
return -EINVAL;
route.msi.address_lo = msi->address_lo;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 03e6a38094c1..3ee537eb0bab 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1641,6 +1641,8 @@ static void kvm_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ int old_flags = old ? old->flags : 0;
+ int new_flags = new ? new->flags : 0;
/*
* Update the total number of memslot pages before calling the arch
* hook so that architectures can consume the result directly.
@@ -1650,6 +1652,12 @@ static void kvm_commit_memory_region(struct kvm *kvm,
else if (change == KVM_MR_CREATE)
kvm->nr_memslot_pages += new->npages;
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
+ int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
+ atomic_set(&kvm->nr_memslots_dirty_logging,
+ atomic_read(&kvm->nr_memslots_dirty_logging) + change);
+ }
+
kvm_arch_commit_memory_region(kvm, old, new, change);
switch (change) {
@@ -2514,7 +2522,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
* 1 indicates success, -errno is returned if error is detected.
*/
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
- bool *writable, kvm_pfn_t *pfn)
+ bool interruptible, bool *writable, kvm_pfn_t *pfn)
{
unsigned int flags = FOLL_HWPOISON;
struct page *page;
@@ -2529,6 +2537,8 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
flags |= FOLL_WRITE;
if (async)
flags |= FOLL_NOWAIT;
+ if (interruptible)
+ flags |= FOLL_INTERRUPTIBLE;
npages = get_user_pages_unlocked(addr, 1, &page, flags);
if (npages != 1)
@@ -2638,6 +2648,7 @@ out:
* Pin guest page in memory and return its pfn.
* @addr: host virtual address which maps memory to the guest
* @atomic: whether this function can sleep
+ * @interruptible: whether the process can be interrupted by non-fatal signals
* @async: whether this function need to wait IO complete if the
* host page is not in the memory
* @write_fault: whether we should get a writable host page
@@ -2648,8 +2659,8 @@ out:
* 2): @write_fault = false && @writable, @writable will tell the caller
* whether the mapping is writable.
*/
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
- bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
+ bool *async, bool write_fault, bool *writable)
{
struct vm_area_struct *vma;
kvm_pfn_t pfn;
@@ -2664,9 +2675,12 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
if (atomic)
return KVM_PFN_ERR_FAULT;
- npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
+ writable, &pfn);
if (npages == 1)
return pfn;
+ if (npages == -EINTR)
+ return KVM_PFN_ERR_SIGPENDING;
mmap_read_lock(current->mm);
if (npages == -EHWPOISON ||
@@ -2697,8 +2711,8 @@ exit:
}
kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
- bool atomic, bool *async, bool write_fault,
- bool *writable, hva_t *hva)
+ bool atomic, bool interruptible, bool *async,
+ bool write_fault, bool *writable, hva_t *hva)
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
@@ -2723,7 +2737,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
writable = NULL;
}
- return hva_to_pfn(addr, atomic, async, write_fault,
+ return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
writable);
}
EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
@@ -2731,20 +2745,22 @@ EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
- return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
- write_fault, writable, NULL);
+ return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
+ NULL, write_fault, writable, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
+ NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
{
- return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
+ NULL, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 41da467d99c9..a1ab15006af3 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -24,8 +24,8 @@
#define KVM_MMU_READ_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock)
#endif /* KVM_HAVE_MMU_RWLOCK */
-kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
- bool write_fault, bool *writable);
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
+ bool *async, bool write_fault, bool *writable);
#ifdef CONFIG_HAVE_KVM_PFNCACHE
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 346e47f15572..5f83321bfd2a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -185,7 +185,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
}
/* We always request a writeable mapping */
- new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL);
+ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL);
if (is_error_noslot_pfn(new_pfn))
goto out_error;
@@ -297,7 +297,12 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
if (!gpc->valid || old_uhva != gpc->uhva) {
ret = hva_to_pfn_retry(kvm, gpc);
} else {
- /* If the HVA→PFN mapping was already valid, don't unmap it. */
+ /*
+ * If the HVA→PFN mapping was already valid, don't unmap it.
+ * But do update gpc->khva because the offset within the page
+ * may have changed.
+ */
+ gpc->khva = old_khva + page_offset;
old_pfn = KVM_PFN_ERR_FAULT;
old_khva = NULL;
ret = 0;