From 4e0b1ab72b8af961bcaca9ec1475279c1cd9579c Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Tue, 29 Nov 2016 07:17:55 +0100 Subject: KVM: s390: gs support for kvm guests This patch adds guarded storage support for KVM guest. We need to setup the necessary control blocks, the kvm_run structure for the new registers, the necessary wrappers for VSIE, as well as the machine check save areas. GS is enabled lazily and the register saving and reloading is done in KVM code. As this feature adds new content for migration, we provide a new capability for enablement (KVM_CAP_S390_GS). Signed-off-by: Fan Zhang Reviewed-by: Christian Borntraeger Reviewed-by: Janosch Frank Signed-off-by: Christian Borntraeger --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377..c9d522765f8f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_S390_GS 137 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From a8a3c426772e55ae9c3209f061cb6317268f932c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:19 +0000 Subject: KVM: MIPS: Add VZ & TE capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new KVM_CAP_MIPS_VZ and KVM_CAP_MIPS_TE capabilities, and in order to allow MIPS KVM to support VZ without confusing old users (which expect the trap & emulate implementation), define and start checking KVM_CREATE_VM type codes. The codes available are: - KVM_VM_MIPS_TE = 0 This is the current value expected from the user, and will create a VM using trap & emulate in user mode, confined to the user mode address space. This may in future become unavailable if the kernel is only configured to support VZ, in which case the EINVAL error will be returned and KVM_CAP_MIPS_TE won't be available even though KVM_CAP_MIPS_VZ is. - KVM_VM_MIPS_VZ = 1 This can be provided when the KVM_CAP_MIPS_VZ capability is available to create a VM using VZ, with a fully virtualized guest virtual address space. If VZ support is unavailable in the kernel, the EINVAL error will be returned (although old kernels without the KVM_CAP_MIPS_VZ capability may well succeed and create a trap & emulate VM). This is designed to allow the desired implementation (T&E vs VZ) to be potentially chosen at runtime rather than being fixed in the kernel configuration. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 47 ++++++++++++++++++++++++++++++++++++++- arch/mips/kvm/mips.c | 9 ++++++++ include/uapi/linux/kvm.h | 6 +++++ 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3c248f772ae6..4b5fa2571efa 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -115,12 +115,17 @@ will access the virtual machine's physical address space; offset zero corresponds to guest physical address zero. Use of mmap() on a VM fd is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is available. -You most certainly want to use 0 as machine type. +You probably want to use 0 as machine type. In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + 4.3 KVM_GET_MSR_INDEX_LIST @@ -4147,3 +4152,43 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + + 0: The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1: The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. + +8.6 KVM_CAP_MIPS_TE + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c507533ef6ea..476ece99bf3b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -107,6 +107,14 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + switch (type) { + case KVM_VM_MIPS_TE: + break; + default: + /* Unsupported KVM type */ + return -EINVAL; + }; + /* Allocate page table to map GPA -> RPA */ kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); if (!kvm->arch.gpa_mm.pgd) @@ -1038,6 +1046,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_MIPS_TE: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377..58ddedce4235 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 +/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ +#define KVM_VM_MIPS_TE 0 +#define KVM_VM_MIPS_VZ 1 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -883,6 +887,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_MIPS_VZ 137 +#define KVM_CAP_MIPS_TE 138 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 578fd61d2d210a3b58dc107f5382b965922ac253 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Mar 2017 10:15:20 +0000 Subject: KVM: MIPS: Add 64BIT capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new KVM_CAP_MIPS_64BIT capability to indicate that 64-bit MIPS guests are available and supported. In this case it should still be possible to run 32-bit guest code. If not available it won't be possible to run 64-bit guest code and the instructions may not be available, or the kernel may not support full context switching of 64-bit registers. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: Jonathan Corbet Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Cc: linux-doc@vger.kernel.org --- Documentation/virtual/kvm/api.txt | 25 +++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4b5fa2571efa..1b8486c094b4 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4192,3 +4192,28 @@ to KVM_CREATE_VM to create a VM which utilises it. If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT + +Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + + 0: MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. + It will also be possible to run 32-bit guest code. + + 2: MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 58ddedce4235..1e1a6c728a18 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -889,6 +889,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_IMMEDIATE_EXIT 136 #define KVM_CAP_MIPS_VZ 137 #define KVM_CAP_MIPS_TE 138 +#define KVM_CAP_MIPS_64BIT 139 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 47a4693e1d3eb09e523c223753fb5a97721f49b8 Mon Sep 17 00:00:00 2001 From: Yi Min Zhao Date: Fri, 10 Mar 2017 09:29:38 +0100 Subject: KVM: s390: introduce AIS capability Introduce a cap to enable AIS facility bit, and add documentation for this capability. Signed-off-by: Yi Min Zhao Signed-off-by: Fei Li Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 8 ++++++++ arch/s390/kvm/kvm-s390.c | 15 +++++++++++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 24 insertions(+) (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 725250858479..598278cd0dc5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4110,6 +4110,14 @@ Returns: 0 on success; -EINVAL if the machine does not support Allows use of guarded storage for the KVM guest. +7.10 KVM_CAP_S390_AIS + +Architectures: s390 +Parameters: none + +Allow use of adapter-interruption suppression. +Returns: 0 on success; -EBUSY if a VCPU has already been created. + 8. Other capabilities. ---------------------- diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 977cc1660a83..11b7d6638991 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -380,6 +380,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_AIS: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -544,6 +545,20 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_AIS: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else { + set_kvm_facility(kvm->arch.model.fac_mask, 72); + set_kvm_facility(kvm->arch.model.fac_list, 72); + kvm->arch.float_int.ais_enabled = 1; + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: AIS %s", + r ? "(not available)" : "(success)"); + break; case KVM_CAP_S390_GS: r = -EINVAL; mutex_lock(&kvm->lock); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c9d522765f8f..33dd2a4e36dc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -884,6 +884,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 #define KVM_CAP_S390_GS 137 +#define KVM_CAP_S390_AIS 138 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From ad6260da1e23cf937806e42c8490af3ff4530474 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Mar 2017 14:30:40 +0200 Subject: KVM: x86: drop legacy device assignment Legacy device assignment has been deprecated since 4.2 (released 1.5 years ago). VFIO is better and everyone should have switched to it. If they haven't, this should convince them. :) Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 204 ------- arch/x86/kvm/Kconfig | 12 - arch/x86/kvm/Makefile | 2 - arch/x86/kvm/assigned-dev.c | 1058 ------------------------------------- arch/x86/kvm/assigned-dev.h | 32 -- arch/x86/kvm/iommu.c | 356 ------------- arch/x86/kvm/x86.c | 14 +- include/linux/kvm_host.h | 16 - virt/kvm/kvm_main.c | 17 - 9 files changed, 1 insertion(+), 1710 deletions(-) delete mode 100644 arch/x86/kvm/assigned-dev.c delete mode 100644 arch/x86/kvm/assigned-dev.h delete mode 100644 arch/x86/kvm/iommu.c (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 753e88e5eb2a..1a184843bf9c 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1326,130 +1326,6 @@ The flags bitmap is defined as: /* the host supports the ePAPR idle hcall #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) -4.48 KVM_ASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. -Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -/* The following two depend on KVM_CAP_PCI_2_3 */ -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts -via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other -assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the -guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. - -The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure -isolation of the device. Usages not specifying this flag are deprecated. - -Only PCI header type 0 devices with PCI BAR resources are supported by -device assignment. The user requesting this ioctl must have read/write -access to the PCI sysfs resource files associated with the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - -4.50 KVM_ASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.51 KVM_DEASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - - 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING @@ -1536,52 +1412,6 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; -4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. The number is -reset again by terminating the MSI-X assignment of the device via -KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier -point will fail. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - 4.55 KVM_SET_TSC_KHZ @@ -1733,40 +1563,6 @@ should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. -4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated) - -Capability: KVM_CAP_PCI_2_3 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Allows userspace to mask PCI INTx interrupts from the assigned device. The -kernel will not deliver INTx interrupts to the guest between setting and -clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of -and emulation of PCI 2.3 INTx disable command register behavior. - -This may be used for both PCI 2.3 devices supporting INTx disable natively and -older devices lacking this support. Userspace is responsible for emulating the -read value of the INTx disable bit in the guest visible PCI command register. -When modifying the INTx disable state, userspace should precede updating the -physical device command register by calling this ioctl to inform the kernel of -the new intended INTx mask state. - -Note that the kernel uses the device INTx disable bit to internally manage the -device interrupt state for PCI 2.3 devices. Reads of this register may -therefore not match the expected value. Writes should always use the guest -intended INTx disable value rather than attempting to read-copy-update the -current physical device state. Races between user and kernel updates to the -INTx disable bit are handled lazily in the kernel. It's possible the device -may generate unintended interrupts, but they will not be injected into the -guest. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is -evaluated. - - 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. Kay - * Author: Weidong Han - * Author: Ben-Ami Yassour - */ - -#include -#include -#include -#include -#include -#include -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." - " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ccbd45ecd41a..1853cda7f6d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -2675,10 +2674,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: @@ -2713,11 +2708,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -4230,7 +4220,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -8068,7 +8058,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,7 +8141,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a..f1339a7756b3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -877,22 +877,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88257b311cb5..ff3bf5d26e0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1019,8 +1019,6 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, as_id, slots); - /* slot was deleted or moved, clear iommu mapping */ - kvm_iommu_unmap_pages(kvm, &old); /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -1055,21 +1053,6 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, &old, &new); kvfree(old_memslots); - - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - return r; - } - return 0; out_slots: -- cgit v1.2.3 From 4b4357e02523ec63ad853f927f5d93a25101a1d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 31 Mar 2017 13:53:23 +0200 Subject: kvm: make KVM_COALESCED_MMIO_PAGE_OFFSET public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its value has never changed; we might as well make it part of the ABI instead of using the return value of KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO). Because PPC does not always make MMIO available, the code has to be made dependent on CONFIG_KVM_MMIO rather than KVM_COALESCED_MMIO_PAGE_OFFSET. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm/include/uapi/asm/kvm.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/include/uapi/asm/kvm.h | 2 ++ arch/mips/include/asm/kvm_host.h | 1 - arch/mips/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 3 --- arch/powerpc/include/uapi/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/include/uapi/asm/kvm.h | 3 +++ include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 10 +++++----- 12 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 31ee468ce667..de67ce647501 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 32 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd1..254a38cace2a 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7705e7bb07b..522e4f60976e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -31,7 +31,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 512 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e..aa5ab69c1312 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5c518c148f9d..2998479fd4e8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #ifdef CONFIG_KVM_MIPS_VZ diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 3107095d7f0a..0318c6b442ab 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -21,6 +21,8 @@ #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f415627..01d05c76f1c7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -45,9 +45,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -#endif #define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 4edbe4bb0e8b..07fbeb927834 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -29,6 +29,9 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_GUEST_DEBUG +/* Not always available, but if it is, this is the correct offset. */ +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7dbb8d622683..d962fa998a6f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include #include +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f1339a7756b3..7e74ae4d99bb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -403,7 +403,7 @@ struct kvm { struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b5dcde10c53b..f489167839c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2349,7 +2349,7 @@ static int kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif @@ -2918,7 +2918,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; #endif @@ -2971,7 +2971,7 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; @@ -3163,7 +3163,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); if (r < 0) { kvm_put_kvm(kvm); @@ -3216,7 +3216,7 @@ static long kvm_dev_ioctl(struct file *filp, #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; -- cgit v1.2.3 From 328e566479449194979d64685ae6d74c989599bb Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 24 Mar 2016 11:21:04 +0100 Subject: KVM: arm/arm64: vgic: Defer touching GICH_VMCR to vcpu_load/put We don't have to save/restore the VMCR on every entry to/from the guest, since on GICv2 we can access the control interface from EL1 and on VHE systems with GICv3 we can access the control interface from KVM running in EL2. GICv3 systems without VHE becomes the rare case, which has to save/restore the register on each round trip. Note that userspace accesses may see out-of-date values if the VCPU is running while accessing the VGIC state via the KVM device API, but this is already the case and it is up to userspace to quiesce the CPUs before reading the CPU registers from the GIC for an up-to-date view. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_asm.h | 3 +++ arch/arm/kvm/arm.c | 11 ++++++----- arch/arm64/include/asm/kvm_asm.h | 2 ++ include/kvm/arm_vgic.h | 3 +++ virt/kvm/arm/hyp/vgic-v2-sr.c | 3 --- virt/kvm/arm/hyp/vgic-v3-sr.c | 14 ++++++++++---- virt/kvm/arm/vgic/vgic-init.c | 12 ++++++++++++ virt/kvm/arm/vgic/vgic-v2.c | 24 ++++++++++++++++++++++-- virt/kvm/arm/vgic/vgic-v3.c | 22 ++++++++++++++++++++-- virt/kvm/arm/vgic/vgic.c | 22 ++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 6 ++++++ 11 files changed, 106 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 8ef05381984b..dd16044b34b6 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -75,7 +75,10 @@ extern void __init_stage2_translation(void); extern void __kvm_hyp_reset(unsigned long); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96dba7cd8be7..46fd37578693 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -351,15 +351,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); kvm_arm_set_running_vcpu(vcpu); + + kvm_vgic_load(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - /* - * The arch-generic KVM code expects the cpu field of a vcpu to be -1 - * if the vcpu is no longer assigned to a cpu. This is used for the - * optimized make_all_cpus_request path. - */ + kvm_vgic_put(vcpu); + vcpu->cpu = -1; kvm_arm_set_running_vcpu(NULL); @@ -633,7 +632,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * non-preemptible context. */ preempt_disable(); + kvm_pmu_flush_hwstate(vcpu); + kvm_timer_flush_hwstate(vcpu); kvm_vgic_flush_hwstate(vcpu); diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ec3553eb9349..49f99cd02613 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,6 +59,8 @@ extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b72dd2ad5f44..f7a2e31eb4c1 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -306,6 +306,9 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); +void kvm_vgic_load(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index c8aeb7b91ec8..d3d3b9b0c2c3 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -114,8 +114,6 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) if (!base) return; - cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR); - if (vcpu->arch.vgic_cpu.live_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); @@ -165,7 +163,6 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) } } - writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR); vcpu->arch.vgic_cpu.live_lrs = live_lrs; } diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 3947095cc0a1..e51ee7edf953 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -159,8 +159,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) if (!cpu_if->vgic_sre) dsb(st); - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - if (vcpu->arch.vgic_cpu.live_lrs) { int i; u32 max_lr_idx, nr_pri_bits; @@ -261,8 +259,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) live_lrs |= (1 << i); } - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - if (live_lrs) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); @@ -326,3 +322,13 @@ u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) { return read_gicreg(ICH_VTR_EL2); } + +u64 __hyp_text __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 276139a24e6f..e8e973b72ca5 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -262,6 +262,18 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); dist->initialized = true; + + /* + * If we're initializing GICv2 on-demand when first running the VCPU + * then we need to load the VGIC state onto the CPU. We can detect + * this easily by checking if we are in between vcpu_load and vcpu_put + * when we just initialized the VGIC. + */ + preempt_disable(); + vcpu = kvm_arm_get_running_vcpu(); + if (vcpu) + kvm_vgic_load(vcpu); + preempt_enable(); out: return ret; } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b834ecdf3225..2f241e026c8f 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -184,6 +184,7 @@ void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; u32 vmcr; vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; @@ -194,12 +195,15 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; @@ -375,3 +379,19 @@ out: return ret; } + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR); +} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index be0f4c3e0142..99213d744e4f 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -173,6 +173,7 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 vmcr; /* @@ -188,12 +189,15 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; /* * Ignore the FIQen bit, because GIC emulation always implies @@ -386,3 +390,17 @@ int vgic_v3_probe(const struct gic_kvm_info *info) return 0; } + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 654dfd40e449..2ac0def57424 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -656,6 +656,28 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); } +void kvm_vgic_load(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_load(vcpu); + else + vgic_v3_load(vcpu); +} + +void kvm_vgic_put(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_put(vcpu); + else + vgic_v3_put(vcpu); +} + int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index db28f7cadab2..9afb4557c7e8 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -130,6 +130,9 @@ int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); +void vgic_v2_load(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu); + static inline void vgic_get_irq_kref(struct vgic_irq *irq) { if (irq->intid < VGIC_MIN_LPI) @@ -150,6 +153,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +void vgic_v3_load(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu); + int vgic_register_its_iodevs(struct kvm *kvm); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); -- cgit v1.2.3 From 00dafa0fcfe9fb1d863f08dc45d6f05ac9505d46 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 23 Dec 2016 00:04:59 +0100 Subject: KVM: arm/arm64: vgic: Get rid of live_lrs There is no need to calculate and maintain live_lrs when we always populate the lowest numbered LRs first on every entry and clear all LRs on every exit. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 2 -- virt/kvm/arm/hyp/vgic-v2-sr.c | 39 ++++++++++----------------------------- virt/kvm/arm/hyp/vgic-v3-sr.c | 42 ++++++++++++------------------------------ 3 files changed, 22 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f7a2e31eb4c1..ea940dbb5dba 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -264,8 +264,6 @@ struct vgic_cpu { */ struct list_head ap_list_head; - u64 live_lrs; - /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index d3d3b9b0c2c3..34b37ce0d4be 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -26,27 +26,23 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u32 eisr0, eisr1; int i; bool expect_mi; expect_mi = !!(cpu_if->vgic_hcr & GICH_HCR_UIE); - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs && !expect_mi; i++) expect_mi |= (!(cpu_if->vgic_lr[i] & GICH_LR_HW) && (cpu_if->vgic_lr[i] & GICH_LR_EOI)); - } if (expect_mi) { cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR); if (cpu_if->vgic_misr & GICH_MISR_EOI) { eisr0 = readl_relaxed(base + GICH_EISR0); - if (unlikely(nr_lr > 32)) + if (unlikely(used_lrs > 32)) eisr1 = readl_relaxed(base + GICH_EISR1); else eisr1 = 0; @@ -87,13 +83,10 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1UL << i)) cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; else @@ -110,11 +103,12 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); save_maint_int_state(vcpu, base); @@ -122,8 +116,6 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) save_lrs(vcpu, base); writel_relaxed(0, base + GICH_HCR); - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = ~0UL; @@ -139,31 +131,20 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; - u64 live_lrs = 0; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - - for (i = 0; i < nr_lr; i++) - if (cpu_if->vgic_lr[i] & GICH_LR_STATE) - live_lrs |= 1UL << i; - - if (live_lrs) { + if (used_lrs) { writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); writel_relaxed(cpu_if->vgic_apr, base + GICH_APR); - for (i = 0; i < nr_lr; i++) { - if (!(live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } } - - vcpu->arch.vgic_cpu.live_lrs = live_lrs; } #ifdef CONFIG_ARM64 diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index e51ee7edf953..b3c36b64df34 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -118,18 +118,16 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr) } } -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) +static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; int i; bool expect_mi; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && (cpu_if->vgic_lr[i] & ICH_LR_EOI)); } @@ -150,6 +148,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; /* @@ -159,23 +158,19 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) if (!cpu_if->vgic_sre) dsb(st); - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { int i; - u32 max_lr_idx, nr_pri_bits; + u32 nr_pri_bits; cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); write_gicreg(0, ICH_HCR_EL2); val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; + save_maint_int_state(vcpu); + for (i = 0; i <= used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else @@ -203,8 +198,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) default: cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); } - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { cpu_if->vgic_misr = 0; cpu_if->vgic_eisr = 0; @@ -232,9 +225,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; + u32 nr_pri_bits; int i; /* @@ -251,15 +244,9 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) } val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - if (live_lrs) { + if (used_lrs) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); switch (nr_pri_bits) { @@ -282,12 +269,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); } - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - + for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } } /* @@ -299,7 +282,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) isb(); dsb(sy); } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; /* * Prevent the guest from touching the GIC system registers if -- cgit v1.2.3 From 096f31c4360f6bab130e3f68513719ec6890128c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 29 Dec 2016 15:57:31 +0100 Subject: KVM: arm/arm64: vgic: Get rid of MISR and EISR fields We don't use these fields anymore so let's nuke them completely. Acked-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_vgic.h | 4 ---- virt/kvm/arm/hyp/vgic-v2-sr.c | 2 -- virt/kvm/arm/hyp/vgic-v3-sr.c | 2 -- 3 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ea940dbb5dba..26ed4fb896bb 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -225,8 +225,6 @@ struct vgic_dist { struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ u64 vgic_elrsr; /* Saved only */ u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; @@ -236,8 +234,6 @@ struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ u32 vgic_elrsr; /* Saved only */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index a4c3bb005725..a3f18d362366 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -77,9 +77,7 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) writel_relaxed(0, base + GICH_HCR); } else { - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = ~0UL; - cpu_if->vgic_misr = 0; cpu_if->vgic_apr = 0; } } diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 41bbbb054a6f..3d0b1ddb6929 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -170,8 +170,6 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); } } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = 0xffff; cpu_if->vgic_ap0r[0] = 0; cpu_if->vgic_ap0r[1] = 0; -- cgit v1.2.3 From 3fe17e6826162021d5e9274949571b19fc94826b Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 27 Sep 2016 21:08:05 +0200 Subject: KVM: arm/arm64: Add ARM user space interrupt signaling ABI We have 2 modes for dealing with interrupts in the ARM world. We can either handle them all using hardware acceleration through the vgic or we can emulate a gic in user space and only drive CPU IRQ pins from there. Unfortunately, when driving IRQs from user space, we never tell user space about events from devices emulated inside the kernel, which may result in interrupt line state changes, so we lose out on for example timer and PMU events if we run with user space gic emulation. Define an ABI to publish such device output levels to userspace. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Alexander Graf Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 42 +++++++++++++++++++++++++++++++++++++++ arch/arm/include/uapi/asm/kvm.h | 2 ++ arch/arm64/include/uapi/asm/kvm.h | 2 ++ include/uapi/linux/kvm.h | 8 ++++++++ 4 files changed, 54 insertions(+) (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3c248f772ae6..3b4e76e5201e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4147,3 +4147,45 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. + + +8.5 KVM_CAP_ARM_USER_IRQ + +Architectures: arm, arm64 +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. + +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd1..a5838d605e7b 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -114,6 +114,8 @@ struct kvm_debug_exit_arch { }; struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e..cd6bea495e63 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -143,6 +143,8 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW (1 << 17) struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377..6d6b9b237f0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -883,6 +883,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_ARM_USER_IRQ 137 #ifdef KVM_CAP_IRQ_ROUTING @@ -1354,4 +1355,11 @@ struct kvm_assigned_msix_entry { #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +/* Available with KVM_CAP_ARM_USER_IRQ */ + +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + #endif /* __LINUX_KVM_H */ -- cgit v1.2.3 From d9e1397783765a275c3a7930250dcdb7e9480d7d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 27 Sep 2016 21:08:06 +0200 Subject: KVM: arm/arm64: Support arch timers with a userspace gic If you're running with a userspace gic or other interrupt controller (that is no vgic in the kernel), then you have so far not been able to use the architected timers, because the output of the architected timers, which are driven inside the kernel, was a kernel-only construct between the arch timer code and the vgic. This patch implements the new KVM_CAP_ARM_USER_IRQ feature, where we use a side channel on the kvm_run structure, run->s.regs.device_irq_level, to always notify userspace of the timer output levels when using a userspace irqchip. This works by ensuring that before we enter the guest, if the timer output level has changed compared to what we last told userspace, we don't enter the guest, but instead return to userspace to notify it of the new level. If we are exiting, because of an MMIO for example, and the level changed at the same time, the value is also updated and userspace can sample the line as it needs. This is nicely achieved simply always updating the timer_irq_level field after the main run loop. Note that the kvm_timer_update_irq trace event is changed to show the host IRQ number for the timer instead of the guest IRQ number, because the kernel no longer know which IRQ userspace wires up the timer signal to. Also note that this patch implements all required functionality but does not yet advertise the capability. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Alexander Graf Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 18 +++---- include/kvm/arm_arch_timer.h | 2 + virt/kvm/arm/arch_timer.c | 122 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 110 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c378502623f6..ac6e57bcbe4d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -515,13 +515,7 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return ret; } - /* - * Enable the arch timers only if we have an in-kernel VGIC - * and it has been properly initialized, since we cannot handle - * interrupts from the virtual timer with a userspace gic. - */ - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) - ret = kvm_timer_enable(vcpu); + ret = kvm_timer_enable(vcpu); return ret; } @@ -640,9 +634,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_disable(); /* - * Re-check atomic conditions + * If we have a singal pending, or need to notify a userspace + * irqchip about timer level changes, then we exit (and update + * the timer level state in kvm_timer_update_run below). */ - if (signal_pending(current)) { + if (signal_pending(current) || + kvm_timer_should_notify_user(vcpu)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } @@ -714,6 +711,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = handle_exit(vcpu, run, ret); } + /* Tell userspace about in-kernel device output levels */ + kvm_timer_update_run(vcpu); + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); return ret; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fe797d6ef89d..295584f31a4e 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -63,6 +63,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 363f0d2cfc79..5dc216748d54 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -184,6 +184,27 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (vtimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (ptimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { @@ -194,9 +215,12 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, - timer_ctx->irq.level); - WARN_ON(ret); + if (likely(irqchip_in_kernel(vcpu->kvm))) { + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + timer_ctx->irq.irq, + timer_ctx->irq.level); + WARN_ON(ret); + } } /* @@ -215,7 +239,7 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ - if (!timer->enabled) + if (unlikely(!timer->enabled)) return; if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) @@ -282,28 +306,12 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) timer_disarm(timer); } -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the host, - * and inject an interrupt if that was the case. - */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; int ret; - if (unlikely(!timer->enabled)) - return; - - kvm_timer_update_state(vcpu); - - /* Set the background timer for the physical timer emulation. */ - kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); - /* * If we enter the guest with the virtual input level to the VGIC * asserted, then we have already told the VGIC what we need to, and @@ -355,11 +363,72 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) vtimer->active_cleared_last = !phys_active; } +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return vtimer->irq.level != vlevel || + ptimer->irq.level != plevel; +} + +static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * To prevent continuously exiting from the guest, we mask the + * physical interrupt such that the guest can make forward progress. + * Once we detect the output level being deasserted, we unmask the + * interrupt again so that we exit from the guest when the timer + * fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, 0); +} + +/** + * kvm_timer_flush_hwstate - prepare timers before running the vcpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case, making sure the timer is + * masked or disabled on the host so that we keep executing. Also schedule a + * software timer for the physical timer if it is enabled. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + if (unlikely(!timer->enabled)) + return; + + kvm_timer_update_state(vcpu); + + /* Set the background timer for the physical timer emulation. */ + kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_timer_flush_hwstate_user(vcpu); + else + kvm_timer_flush_hwstate_vgic(vcpu); +} + /** * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer has expired while we were running in the guest, + * Check if any of the timers have expired while we were running in the guest, * and inject an interrupt if that was the case. */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) @@ -559,6 +628,13 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (timer->enabled) return 0; + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + /* * Find the physical IRQ number corresponding to the host_vtimer_irq */ @@ -582,8 +658,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (ret) return ret; +no_vgic: timer->enabled = 1; - return 0; } -- cgit v1.2.3 From 3dbbdf78636e66094d82c4df496c54ff6ae46e31 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Wed, 1 Feb 2017 12:51:52 +0100 Subject: KVM: arm/arm64: Report PMU overflow interrupts to userspace irqchip When not using an in-kernel VGIC, but instead emulating an interrupt controller in userspace, we should report the PMU overflow status to that userspace interrupt controller using the KVM_CAP_ARM_USER_IRQ feature. Reviewed-by: Alexander Graf Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 13 +++++++++---- include/kvm/arm_pmu.h | 7 +++++++ virt/kvm/arm/arch_timer.c | 3 --- virt/kvm/arm/pmu.c | 39 +++++++++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index ac6e57bcbe4d..9eda2932f686 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -635,11 +635,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* * If we have a singal pending, or need to notify a userspace - * irqchip about timer level changes, then we exit (and update - * the timer level state in kvm_timer_update_run below). + * irqchip about timer or PMU level changes, then we exit (and + * update the timer level state in kvm_timer_update_run + * below). */ if (signal_pending(current) || - kvm_timer_should_notify_user(vcpu)) { + kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } @@ -712,7 +714,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) } /* Tell userspace about in-kernel device output levels */ - kvm_timer_update_run(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 92e7e97ca8ff..1ab4633adf4f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -50,6 +50,8 @@ void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, @@ -85,6 +87,11 @@ static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} +static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 5dc216748d54..5976609ef27c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -193,9 +193,6 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); struct kvm_sync_regs *regs = &vcpu->run->s.regs; - if (likely(irqchip_in_kernel(vcpu->kvm))) - return; - /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 69ccce308458..4b43e7f3b158 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -230,13 +230,44 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) return; overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level != overflow) { - pmu->irq_level = overflow; - kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret; + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + pmu->irq_num, overflow); + WARN_ON(ret); } } +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + /** * kvm_pmu_flush_hwstate - flush pmu state to cpu * @vcpu: The vcpu pointer -- cgit v1.2.3 From 993225adf4af20a0e50e37c3d4894b79c98e01c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:33 +0200 Subject: KVM: x86: rename kvm_vcpu_request_scan_ioapic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's rename it into a proper arch specific callback. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 2 +- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index dc29a2785b81..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,7 +266,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { if (!ioapic_in_kernel(kvm)) return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e74ae4d99bb..397b7b5b1933 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -502,10 +502,10 @@ int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4d28a9ddbee0..a8d540398bbd 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -490,7 +490,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -500,7 +500,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_srcu(&kvm->irq_srcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } #endif -- cgit v1.2.3 From 4898d3f49b5b156c33f0ae0f49ede417ab86195e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:51 +1100 Subject: KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number This adds a capability number for in-kernel support for VFIO on SPAPR platform. The capability will tell the user space whether in-kernel handlers of H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space must not attempt allocating a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests will not be passed to the user space which is desired action in the situation like that. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6180ea50e9ef..7b488eae61b8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -892,6 +892,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MIPS_64BIT 139 #define KVM_CAP_S390_GS 140 #define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 121f80ba68f1a5779a36d7b3247206e60e0a7418 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Mar 2017 15:21:56 +1100 Subject: KVM: PPC: VFIO: Add in-kernel acceleration for VFIO This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests targeted an IOMMU TCE table used for VFIO without passing them to user space which saves time on switching to user space and back. This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM. KVM tries to handle a TCE request in the real mode, if failed it passes the request to the virtual mode to complete the operation. If it a virtual mode handler fails, the request is passed to the user space; this is not expected to happen though. To avoid dealing with page use counters (which is tricky in real mode), this only accelerates SPAPR TCE IOMMU v2 clients which are required to pre-register the userspace memory. The very first TCE request will be handled in the VFIO SPAPR TCE driver anyway as the userspace view of the TCE table (iommu_table::it_userspace) is not allocated till the very first mapping happens and we cannot call vmalloc in real mode. If we fail to update a hardware IOMMU table unexpected reason, we just clear it and move on as there is nothing really we can do about it - for example, if we hot plug a VFIO device to a guest, existing TCE tables will be mirrored automatically to the hardware and there is no interface to report to the guest about possible failures. This adds new attribute - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE - to the VFIO KVM device. It takes a VFIO group fd and SPAPR TCE table fd and associates a physical IOMMU table with the SPAPR TCE table (which is a guest view of the hardware IOMMU table). The iommu_table object is cached and referenced so we do not have to look up for it in real mode. This does not implement the UNSET counterpart as there is no use for it - once the acceleration is enabled, the existing userspace won't disable it unless a VFIO container is destroyed; this adds necessary cleanup to the KVM_DEV_VFIO_GROUP_DEL handler. This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user space. This adds real mode version of WARN_ON_ONCE() as the generic version causes problems with rcu_sched. Since we testing what vmalloc_to_phys() returns in the code, this also adds a check for already existing vmalloc_to_phys() call in kvmppc_rm_h_put_tce_indirect(). This finally makes use of vfio_external_user_iommu_id() which was introduced quite some time ago and was considered for removal. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/devices/vfio.txt | 18 +- arch/powerpc/include/asm/kvm_host.h | 8 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/kvm/book3s_64_vio.c | 306 ++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_64_vio_hv.c | 201 ++++++++++++++++++- arch/powerpc/kvm/powerpc.c | 2 + include/uapi/linux/kvm.h | 6 + virt/kvm/vfio.c | 105 ++++++++++ 8 files changed, 645 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740c67ca..528c77c8022c 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,21 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0f3ac09cbfe0..77c60826d145 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -188,6 +188,13 @@ struct kvmppc_pginfo { atomic_t refcnt; }; +struct kvmppc_spapr_tce_iommu_table { + struct rcu_head rcu; + struct list_head next; + struct iommu_table *tbl; + struct kref kref; +}; + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -196,6 +203,7 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* window size in pages */ + struct list_head iommu_tables; struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4d079a29eae2..5885d327c025 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -173,6 +173,10 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp); +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d507d94e020c..a160c14304eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -40,6 +42,7 @@ #include #include #include +#include static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { @@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) return ret; } +static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, + struct kvmppc_spapr_tce_iommu_table, rcu); + + iommu_tce_table_put(stit->tbl); + + kfree(stit); +} + +static void kvm_spapr_tce_liobn_put(struct kref *kref) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref, + struct kvmppc_spapr_tce_iommu_table, kref); + + list_del_rcu(&stit->next); + + call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); +} + +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) +{ + int i; + struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; + struct iommu_table_group *table_group = NULL; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + continue; + + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] != stit->tbl) + continue; + + kref_put(&stit->kref, kvm_spapr_tce_liobn_put); + return; + } + } + } +} + +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + bool found = false; + struct iommu_table *tbl = NULL; + struct iommu_table_group *table_group; + long i; + struct kvmppc_spapr_tce_iommu_table *stit; + struct fd f; + + f = fdget(tablefd); + if (!f.file) + return -EBADF; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt == f.file->private_data) { + found = true; + break; + } + } + + fdput(f); + + if (!found) + return -EINVAL; + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + return -EFAULT; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbltmp = table_group->tables[i]; + + if (!tbltmp) + continue; + /* + * Make sure hardware table parameters are exactly the same; + * this is used in the TCE handlers where boundary checks + * use only the first attached table. + */ + if ((tbltmp->it_page_shift == stt->page_shift) && + (tbltmp->it_offset == stt->offset) && + (tbltmp->it_size == stt->size)) { + /* + * Reference the table to avoid races with + * add/remove DMA windows. + */ + tbl = iommu_tce_table_get(tbltmp); + break; + } + } + if (!tbl) + return -EINVAL; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + if (tbl != stit->tbl) + continue; + + if (!kref_get_unless_zero(&stit->kref)) { + /* stit is being destroyed */ + iommu_tce_table_put(tbl); + return -ENOTTY; + } + /* + * The table is already known to this KVM, we just increased + * its KVM reference counter and can return. + */ + return 0; + } + + stit = kzalloc(sizeof(*stit), GFP_KERNEL); + if (!stit) { + iommu_tce_table_put(tbl); + return -ENOMEM; + } + + stit->tbl = tbl; + kref_init(&stit->kref); + + list_add_rcu(&stit->next, &stt->iommu_tables); + + return 0; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, @@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; list_del_rcu(&stt->list); + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + WARN_ON(!kref_read(&stit->kref)); + while (1) { + if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put)) + break; + } + } + kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( @@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + INIT_LIST_HEAD_RCU(&stt->iommu_tables); for (i = 0; i < npages; i++) { stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -211,11 +355,101 @@ fail: return ret; } +static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg(tbl, entry, &hpa, &dir); +} + +static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + return H_HARDWARE; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret != H_SUCCESS) + iommu_tce_xchg(tbl, entry, &hpa, &dir); + + return ret; +} + +long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ + return H_TOO_HARD; + + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + return H_HARDWARE; + + if (mm_iommu_mapped_inc(mem)) + return H_CLOSED; + + ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + if (WARN_ON_ONCE(ret)) { + mm_iommu_mapped_dec(mem); + return H_HARDWARE; + } + + if (dir != DMA_NONE) + kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; - long ret; + long ret, idx; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -232,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + } else { + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, + entry, ua, dir); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -247,6 +509,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long entry, ua = 0; u64 __user *tces; u64 tce; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -285,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -301,6 +584,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -314,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 440d3ab5dc32..eda0a8f6fae8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,31 @@ #include #include +#ifdef CONFIG_BUG + +#define WARN_ON_ONCE_RM(condition) ({ \ + static bool __section(.data.unlikely) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ + __stringify(condition), \ + __func__, __LINE__); \ + dump_stack(); \ + } \ + unlikely(__ret_warn_once); \ +}) + +#else + +#define WARN_ON_ONCE_RM(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) + +#endif + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) /* @@ -161,11 +186,117 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); +} + +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret) + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + + return ret; +} + +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + return H_TOO_HARD; + + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + return H_HARDWARE; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) + return H_CLOSED; + + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + if (ret) { + mm_iommu_mapped_dec(mem); + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + } + + if (dir != DMA_NONE) + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt; long ret; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ @@ -182,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + else + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry, ua, dir); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -223,6 +379,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; bool prereg = false; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -270,6 +427,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); + if (WARN_ON_ONCE_RM(!rmap)) + return H_HARDWARE; /* * Synchronize with the MMU notifier callbacks in @@ -293,6 +452,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + ua = 0; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -309,6 +489,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) @@ -322,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6c7244879bfe..cf725c580fc5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -534,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: + /* fallthrough */ + case KVM_CAP_SPAPR_TCE_VFIO: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7b488eae61b8..3c168b6fd74b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1097,6 +1097,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, @@ -1118,6 +1119,11 @@ enum kvm_device_type { KVM_DEV_TYPE_MAX, }; +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + /* * ioctls for VM fds */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d32f239eb471..37d9118fd84b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -20,6 +20,10 @@ #include #include "vfio.h" +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group *vfio_group; @@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) return ret > 0; } +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +{ + int (*fn)(struct vfio_group *); + int ret = -EINVAL; + + fn = symbol_get(vfio_external_user_iommu_id); + if (!fn) + return ret; + + ret = fn(vfio_group); + + symbol_put(vfio_external_user_iommu_id); + + return ret; +} + +static struct iommu_group *kvm_vfio_group_get_iommu_group( + struct vfio_group *group) +{ + int group_id = kvm_vfio_external_user_iommu_id(group); + + if (group_id < 0) + return NULL; + + return iommu_group_get_by_id(group_id); +} + +static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, + struct vfio_group *vfio_group) +{ + struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + + if (WARN_ON_ONCE(!grp)) + return; + + kvm_spapr_tce_release_iommu_group(kvm, grp); + iommu_group_put(grp); +} +#endif + /* * Groups can use the same or different IOMMU domains. If the same then * adding a new group may change the coherency of groups we've previously @@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) mutex_unlock(&kv->lock); +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group); +#endif kvm_vfio_group_set_kvm(vfio_group, NULL); kvm_vfio_group_put_external_user(vfio_group); @@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) kvm_vfio_update_coherency(dev); return ret; + +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + struct fd f; + struct iommu_group *grp; + + if (copy_from_user(¶m, (void __user *)arg, + sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; + + f = fdget(param.groupfd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + grp = kvm_vfio_group_get_iommu_group(vfio_group); + if (WARN_ON_ONCE(!grp)) { + kvm_vfio_group_put_external_user(vfio_group); + return -EIO; + } + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, + param.tablefd, grp); + break; + } + + mutex_unlock(&kv->lock); + + iommu_group_put(grp); + kvm_vfio_group_put_external_user(vfio_group); + + return ret; + } +#endif /* CONFIG_SPAPR_TCE_IOMMU */ } return -ENXIO; @@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_VFIO_GROUP_ADD: case KVM_DEV_VFIO_GROUP_DEL: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: +#endif return 0; } @@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) struct kvm_vfio_group *kvg, *tmp; list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); +#endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); list_del(&kvg->node); -- cgit v1.2.3 From 668fffa3f838edfcb1679f842f7ef1afa61c3e9a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 21 Apr 2017 12:27:17 +0200 Subject: kvm: better MWAIT emulation for guests Guests that are heavy on futexes end up IPI'ing each other a lot. That can lead to significant slowdowns and latency increase for those guests when running within KVM. If only a single guest is needed on a host, we have a lot of spare host CPU time we can throw at the problem. Modern CPUs implement a feature called "MWAIT" which allows guests to wake up sleeping remote CPUs without an IPI - thus without an exit - at the expense of never going out of guest context. The decision whether this is something sensible to use should be up to the VM admin, so to user space. We can however allow MWAIT execution on systems that support it properly hardware wise. This patch adds a CAP to user space and a KVM cpuid leaf to indicate availability of native MWAIT execution. With that enabled, the worst a guest can do is waste as many cycles as a "jmp ." would do, so it's not a privilege problem. We consciously do *not* expose the feature in our CPUID bitmap, as most people will want to benefit from sleeping vCPUs to allow for over commit. Reported-by: "Gabriel L. Somlo" Signed-off-by: Michael S. Tsirkin [agraf: fix amd, change commit message] Signed-off-by: Alexander Graf Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/kvm/svm.c | 7 +++++-- arch/x86/kvm/vmx.c | 6 ++++-- arch/x86/kvm/x86.c | 3 +++ arch/x86/kvm/x86.h | 36 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 6 files changed, 58 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e60be91d8036..dc674c2b8b31 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4111,3 +4111,12 @@ reserved. 2: MIPS64 or microMIPS64 with access to all address segments. Both registers and addresses are 64-bits wide. It will be possible to run 64-bit or 32-bit guest code. + +8.8 KVM_CAP_X86_GUEST_MWAIT + +Architectures: x86 + +This capability indicates that guest using memory monotoring instructions +(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time +spent while virtual CPU is halted in this way will then be accounted for as +guest running time on the host (as opposed to e.g. HLT). diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1b203abf76e1..c41f03e5090a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1198,10 +1198,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c1a12b94e1fd..a4ef63718101 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3527,11 +3527,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49a69c0a0d50..2f9fe6bf7091 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2687,6 +2687,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a..612067074905 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 +1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include +#include #include #include #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. + */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3c168b6fd74b..e43906b95d9f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -893,6 +893,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_GS 140 #define KVM_CAP_S390_AIS 141 #define KVM_CAP_SPAPR_TCE_VFIO 142 +#define KVM_CAP_X86_GUEST_MWAIT 143 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 72875d8a4d92f6f37e051be522b2252fd49bd50e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:19 +0200 Subject: KVM: add kvm_{test,clear}_request to replace {test,clear}_bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users were expected to use kvm_check_request() for testing and clearing, but request have expanded their use since then and some users want to only test or do a faster clear. Make sure that requests are not directly accessed with bit operations. Reviewed-by: Christian Borntraeger Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 8 ++++---- include/linux/kvm_host.h | 14 ++++++++++++-- 9 files changed, 24 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 34e78a3ee9d7..4144bfaef137 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -982,7 +982,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) * check if any I/O interrupts are pending. */ if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; } } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f026b062c0ed..69a09444d46e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -349,7 +349,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f102616febc7..bcbeeb62dd13 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -344,7 +344,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; return EMULATE_DONE; case H_LOGICAL_CI_LOAD: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3c296c2eacf8..3eaac3809977 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -584,7 +584,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) * userspace, so clear the KVM_REQ_WATCHDOG request. */ if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) - clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + kvm_clear_request(KVM_REQ_WATCHDOG, vcpu); spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); nr_jiffies = watchdog_next_timeout(vcpu); @@ -695,7 +695,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); hard_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cf725c580fc5..1ee22a910074 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -233,7 +233,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) case EV_HCALL_TOKEN(EV_IDLE): r = EV_SUCCESS; kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); break; default: r = EV_UNIMPLEMENTED; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7eb1275cc265..4bafb0a0c8b5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2496,7 +2496,7 @@ retry: } /* nothing to do, just clear the request */ - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4ef63718101..b003b8dfb20c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6299,7 +6299,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f68c5b2ba627..2de54c20fa9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1753,7 +1753,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif @@ -7041,7 +7041,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7169,7 +7169,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -8382,7 +8382,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 397b7b5b1933..374fa92c7657 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1079,10 +1079,20 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) set_bit(req, &vcpu->requests); } +static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) +{ + return test_bit(req, &vcpu->requests); +} + +static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) +{ + clear_bit(req, &vcpu->requests); +} + static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); + if (kvm_test_request(req, vcpu)) { + kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's -- cgit v1.2.3 From 930f7fd6da77ed9476a538345513460fd304aaf5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:22 +0200 Subject: KVM: mark requests that do not need a wakeup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some operations must ensure that the guest is not running with stale data, but if the guest is halted, then the update can wait until another event happens. kvm_make_all_requests() currently doesn't wake up, so we can mark all requests used with it. First 8 bits were arbitrarily reserved for request numbers. Most uses of requests have the request type as a constant, so a compiler will optimize the '&'. An alternative would be to have an inline function that would return whether the request needs a wake-up or not, but I like this one better even though it might produce worse assembly. Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 12 +++++++----- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de67ce647501..49358f20d36f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 522e4f60976e..1c9458a7ec92 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f5c942edbc86..19219826bed6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 374fa92c7657..a805ddcb7eb0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -115,12 +115,14 @@ static inline bool is_error_page(struct page *page) return IS_ERR(page); } +#define KVM_REQUEST_MASK GENMASK(7,0) +#define KVM_REQUEST_NO_WAKEUP BIT(8) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_PENDING_TIMER 2 #define KVM_REQ_UNHALT 3 @@ -1076,17 +1078,17 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); - set_bit(req, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { - return test_bit(req, &vcpu->requests); + return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { - clear_bit(req, &vcpu->requests); + clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) -- cgit v1.2.3 From cde9af6e79046e12cd08d161139b1d5e57e9fbac Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 26 Apr 2017 22:32:24 +0200 Subject: KVM: add explicit barrier to kvm_vcpu_kick kvm_vcpu_kick() must issue a general memory barrier prior to reading vcpu->mode in order to ensure correctness of the mutual-exclusion memory barrier pattern used with vcpu->requests. While the cmpxchg called from kvm_vcpu_kick(): kvm_vcpu_kick kvm_arch_vcpu_should_kick kvm_vcpu_exiting_guest_mode cmpxchg implies general memory barriers before and after the operation, that implication is only valid when cmpxchg succeeds. We need an explicit barrier for when it fails, otherwise a VCPU thread on its entry path that reads zero for vcpu->requests does not exclude the possibility the requesting thread sees !IN_GUEST_MODE when it reads vcpu->mode. kvm_make_all_cpus_request already had a barrier, so we remove it, as now it would be redundant. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 3 --- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0936c3e2e51c..69fcee26f4da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6853,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a805ddcb7eb0..84c5396564f7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -270,6 +270,12 @@ struct kvm_vcpu { static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { + /* + * The memory barrier ensures a previous write to vcpu->requests cannot + * be reordered with the read of vcpu->mode. It pairs with the general + * memory barrier following the write of vcpu->mode in VCPU RUN. + */ + smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3772f7dcc72d..1efb07643035 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,9 +183,6 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - /* Set ->requests bit before we read ->mode. */ - smp_mb__after_atomic(); - if (!(req & KVM_REQUEST_NO_WAKEUP)) kvm_vcpu_wake_up(vcpu); -- cgit v1.2.3 From 178f02ffafafc59d4d4b135242e5cc1515743680 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:26 +0200 Subject: KVM: return if kvm_vcpu_wake_up() did wake up the VCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to kick a VCPU that we have just woken up. Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84c5396564f7..f4a2c00092f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -690,7 +690,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1efb07643035..632f7b3e198c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,8 +183,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - if (!(req & KVM_REQUEST_NO_WAKEUP)) - kvm_vcpu_wake_up(vcpu); + if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + continue; if (cpus != NULL && cpu != -1 && cpu != me && kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) @@ -2195,7 +2195,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { struct swait_queue_head *wqp; @@ -2203,8 +2203,10 @@ void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) if (swait_active(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; + return true; } + return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); @@ -2216,7 +2218,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int me; int cpu = vcpu->cpu; - kvm_vcpu_wake_up(vcpu); + if (kvm_vcpu_wake_up(vcpu)) + return; + me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) -- cgit v1.2.3 From 7a97cec26b94c909f4cbad2dc3186af3e457a522 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Apr 2017 14:33:43 +0200 Subject: KVM: mark requests that need synchronization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_make_all_requests() provides a synchronization that waits until all kicked VCPUs have acknowledged the kick. This is important for KVM_REQ_MMU_RELOAD as it prevents freeing while lockless paging is underway. This patch adds the synchronization property into all requests that are currently being used with kvm_make_all_requests() in order to preserve the current behavior and only introduce a new framework. Removing it from requests where it is not necessary is left for future patches. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 9 +++++---- virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++--- 5 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 49358f20d36f..3cd04d164c64 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1c9458a7ec92..d239ae166c4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19219826bed6..84c8489531bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4a2c00092f8..a5bfffa8c8d4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -117,14 +117,15 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) +#define KVM_REQUEST_WAIT BIT(9) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNHALT 3 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 632f7b3e198c..035bc51f656f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,24 @@ void vcpu_put(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(vcpu_put); +/* TODO: merge with kvm_arch_vcpu_should_kick */ +static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) +{ + int mode = kvm_vcpu_exiting_guest_mode(vcpu); + + /* + * We need to wait for the VCPU to reenable interrupts and get out of + * READING_SHADOW_PAGE_TABLES mode. + */ + if (req & KVM_REQUEST_WAIT) + return mode != OUTSIDE_GUEST_MODE; + + /* + * Need to kick a running VCPU, but otherwise there is nothing to do. + */ + return mode == IN_GUEST_MODE; +} + static void ack_flush(void *_completed) { } @@ -174,6 +192,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) int i, cpu, me; cpumask_var_t cpus; bool called = true; + bool wait = req & KVM_REQUEST_WAIT; struct kvm_vcpu *vcpu; zalloc_cpumask_var(&cpus, GFP_ATOMIC); @@ -187,13 +206,13 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) continue; if (cpus != NULL && cpu != -1 && cpu != me && - kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) + kvm_request_needs_ipi(vcpu, req)) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) - smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); else if (!cpumask_empty(cpus)) - smp_call_function_many(cpus, ack_flush, NULL, 1); + smp_call_function_many(cpus, ack_flush, NULL, wait); else called = false; put_cpu(); -- cgit v1.2.3 From 5c0aea0e8d98e38858fbb3a09870ed8487a01da2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 28 Apr 2017 17:06:20 +0200 Subject: KVM: x86: don't hold kvm->lock in KVM_SET_GSI_ROUTING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We needed the lock to avoid racing with creation of the irqchip on x86. As kvm_set_irq_routing() calls srcu_synchronize_expedited(), this lock might be held for a longer time. Let's introduce an arch specific callback to check if we can actually add irq routes. For x86, all we have to do is check if we have an irqchip in the kernel. We don't need kvm->lock at that point as the irqchip is marked as inititalized only when actually fully created. Reported-by: Steve Rutherford Reviewed-by: Radim Krčmář Fixes: 1df6ddede10a ("KVM: x86: race between KVM_SET_GSI_ROUTING and KVM_CREATE_IRQCHIP") Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/irq_comm.c | 15 +++++++++------ arch/x86/kvm/x86.c | 11 +---------- include/linux/kvm_host.h | 1 + virt/kvm/irqchip.c | 5 +++++ virt/kvm/kvm_main.c | 5 ++--- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 84c8489531bb..f5bddf92faba 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -728,7 +728,6 @@ struct kvm_hv { enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, - KVM_IRQCHIP_INIT_IN_PROGRESS, /* temporarily set during creation */ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 0edd22c3344c..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -111,7 +111,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return mode > KVM_IRQCHIP_INIT_IN_PROGRESS; + return mode != KVM_IRQCHIP_NONE; } void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4517a4c2ac3a..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -274,16 +274,19 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ - if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) - return -EINVAL; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. + */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be2ade58edb9..2fe9aa116288 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3919,14 +3919,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_empty_irq_routing(kvm); - if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); + if (r) goto split_irqchip_unlock; - } /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -4012,12 +4007,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto create_irqchip_unlock; } - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_default_irq_routing(kvm); if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); goto create_irqchip_unlock; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a5bfffa8c8d4..25cf258a1c9b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1018,6 +1018,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #define KVM_MAX_IRQ_ROUTES 1024 #endif +bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index cc30d01a56be..31e40c9e81df 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -172,6 +172,11 @@ void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } +bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return true; +} + int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 035bc51f656f..6281cc2446d5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3075,6 +3075,8 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; + if (!kvm_arch_can_set_irq_routing(kvm)) + goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) @@ -3090,11 +3092,8 @@ static long kvm_vm_ioctl(struct file *filp, routing.nr * sizeof(*entries))) goto out_free_irq_routing; } - /* avoid races with KVM_CREATE_IRQCHIP on x86 */ - mutex_lock(&kvm->lock); r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); - mutex_unlock(&kvm->lock); out_free_irq_routing: vfree(entries); break; -- cgit v1.2.3 From 4e335d9e7ddbcf83d03e7fbe65797ebed2272c18 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 May 2017 16:20:18 +0200 Subject: Revert "KVM: Support vCPU-based gfn->hva cache" This reverts commit bbd6411513aa8ef3ea02abab61318daf87c1af1e. I've been sitting on this revert for too long and it unfortunately missed 4.11. It's also the reason why I haven't merged ring-based dirty tracking for 4.12. Using kvm_vcpu_memslots in kvm_gfn_to_hva_cache_init and kvm_vcpu_write_guest_offset_cached means that the MSR value can now be used to access SMRAM, simply by making it point to an SMRAM physical address. This is problematic because it lets the guest OS overwrite memory that it shouldn't be able to touch. Cc: stable@vger.kernel.org Fixes: bbd6411513aa8ef3ea02abab61318daf87c1af1e Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 ++++++++++++---------- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++-------------------- include/linux/kvm_host.h | 16 ++++++++-------- virt/kvm/kvm_main.c | 34 +++++++++++++++++----------------- 4 files changed, 58 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..9fa5b8164961 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2fe9aa116288..b38a302858a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1785,7 +1785,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1806,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1822,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2064,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2083,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2094,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2103,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2215,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2236,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2858,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -8527,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 25cf258a1c9b..3727afdf614d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -650,18 +650,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6281cc2446d5..4c4d3fe10654 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1975,18 +1975,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, return 0; } -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init); +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; @@ -1996,7 +1996,7 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_write_guest(vcpu, gpa, data, len); + return kvm_write_guest(kvm, gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2008,19 +2008,19 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len); + return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; BUG_ON(len > ghc->len); @@ -2029,7 +2029,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len); + return kvm_read_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2040,7 +2040,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached); +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { -- cgit v1.2.3