diff options
Diffstat (limited to 'virt')
-rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
-rw-r--r-- | virt/kvm/arm/vgic-v2.c | 247 | ||||
-rw-r--r-- | virt/kvm/arm/vgic-v3.c | 247 | ||||
-rw-r--r-- | virt/kvm/arm/vgic.c | 1062 | ||||
-rw-r--r-- | virt/kvm/assigned-dev.c | 6 | ||||
-rw-r--r-- | virt/kvm/async_pf.c | 4 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 136 | ||||
-rw-r--r-- | virt/kvm/ioapic.c | 46 | ||||
-rw-r--r-- | virt/kvm/ioapic.h | 2 | ||||
-rw-r--r-- | virt/kvm/iommu.c | 35 | ||||
-rw-r--r-- | virt/kvm/irq_comm.c | 24 | ||||
-rw-r--r-- | virt/kvm/irqchip.c | 98 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 265 | ||||
-rw-r--r-- | virt/kvm/vfio.c | 27 | ||||
-rw-r--r-- | virt/kvm/vfio.h | 17 |
15 files changed, 1682 insertions, 537 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 13f2d19793e3..fc0c5e603eb4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -6,6 +6,9 @@ config HAVE_KVM config HAVE_KVM_IRQCHIP bool +config HAVE_KVM_IRQFD + bool + config HAVE_KVM_IRQ_ROUTING bool diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c new file mode 100644 index 000000000000..2935405ad22f --- /dev/null +++ b/virt/kvm/arm/vgic-v2.c @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> + +#include <linux/irqchip/arm-gic.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr) +{ + struct vgic_lr lr_desc; + u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr]; + + lr_desc.irq = val & GICH_LR_VIRTUALID; + if (lr_desc.irq <= 15) + lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; + else + lr_desc.source = 0; + lr_desc.state = 0; + + if (val & GICH_LR_PENDING_BIT) + lr_desc.state |= LR_STATE_PENDING; + if (val & GICH_LR_ACTIVE_BIT) + lr_desc.state |= LR_STATE_ACTIVE; + if (val & GICH_LR_EOI) + lr_desc.state |= LR_EOI_INT; + + return lr_desc; +} + +static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr lr_desc) +{ + u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq; + + if (lr_desc.state & LR_STATE_PENDING) + lr_val |= GICH_LR_PENDING_BIT; + if (lr_desc.state & LR_STATE_ACTIVE) + lr_val |= GICH_LR_ACTIVE_BIT; + if (lr_desc.state & LR_EOI_INT) + lr_val |= GICH_LR_EOI; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; +} + +static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr lr_desc) +{ + if (!(lr_desc.state & LR_STATE_MASK)) + vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); +} + +static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr; +} + +static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr; +} + +static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu) +{ + u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr; + u32 ret = 0; + + if (misr & GICH_MISR_EOI) + ret |= INT_STATUS_EOI; + if (misr & GICH_MISR_U) + ret |= 
INT_STATUS_UNDERFLOW; + + return ret; +} + +static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE; +} + +static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE; +} + +static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; + + vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; + vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT; + vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; + vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT; +} + +static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr; + + vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; + vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK; + vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; + vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; + + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; +} + +static void vgic_v2_enable(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + + /* Get the show on the road... 
*/ + vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; +} + +static const struct vgic_ops vgic_v2_ops = { + .get_lr = vgic_v2_get_lr, + .set_lr = vgic_v2_set_lr, + .sync_lr_elrsr = vgic_v2_sync_lr_elrsr, + .get_elrsr = vgic_v2_get_elrsr, + .get_eisr = vgic_v2_get_eisr, + .get_interrupt_status = vgic_v2_get_interrupt_status, + .enable_underflow = vgic_v2_enable_underflow, + .disable_underflow = vgic_v2_disable_underflow, + .get_vmcr = vgic_v2_get_vmcr, + .set_vmcr = vgic_v2_set_vmcr, + .enable = vgic_v2_enable, +}; + +static struct vgic_params vgic_v2_params; + +/** + * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT + * @node: pointer to the DT node + * @ops: address of a pointer to the GICv2 operations + * @params: address of a pointer to HW-specific parameters + * + * Returns 0 if a GICv2 has been found, with the low level operations + * in *ops and the HW parameters in *params. Returns an error code + * otherwise. + */ +int vgic_v2_probe(struct device_node *vgic_node, + const struct vgic_ops **ops, + const struct vgic_params **params) +{ + int ret; + struct resource vctrl_res; + struct resource vcpu_res; + struct vgic_params *vgic = &vgic_v2_params; + + vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); + if (!vgic->maint_irq) { + kvm_err("error getting vgic maintenance irq from DT\n"); + ret = -ENXIO; + goto out; + } + + ret = of_address_to_resource(vgic_node, 2, &vctrl_res); + if (ret) { + kvm_err("Cannot obtain GICH resource\n"); + goto out; + } + + vgic->vctrl_base = of_iomap(vgic_node, 2); + if (!vgic->vctrl_base) { + kvm_err("Cannot ioremap GICH\n"); + ret = -ENOMEM; + goto out; + } + + vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR); + vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1; + + ret = create_hyp_io_mappings(vgic->vctrl_base, + vgic->vctrl_base + resource_size(&vctrl_res), + vctrl_res.start); + if (ret) { + kvm_err("Cannot map VCTRL into hyp\n"); + goto out_unmap; + } + + if (of_address_to_resource(vgic_node, 3, 
&vcpu_res)) { + kvm_err("Cannot obtain GICV resource\n"); + ret = -ENXIO; + goto out_unmap; + } + + if (!PAGE_ALIGNED(vcpu_res.start)) { + kvm_err("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)vcpu_res.start); + ret = -ENXIO; + goto out_unmap; + } + + if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", + (unsigned long long)resource_size(&vcpu_res), + PAGE_SIZE); + ret = -ENXIO; + goto out_unmap; + } + + vgic->vcpu_base = vcpu_res.start; + + kvm_info("%s@%llx IRQ%d\n", vgic_node->name, + vctrl_res.start, vgic->maint_irq); + + vgic->type = VGIC_V2; + *ops = &vgic_v2_ops; + *params = vgic; + goto out; + +out_unmap: + iounmap(vgic->vctrl_base); +out: + of_node_put(vgic_node); + return ret; +} diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c new file mode 100644 index 000000000000..1c2c8eef0599 --- /dev/null +++ b/virt/kvm/arm/vgic-v3.c @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2013 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/cpu.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_emulate.h> +#include <asm/kvm_arm.h> +#include <asm/kvm_mmu.h> + +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +/* + * LRs are stored in reverse order in memory. make sure we index them + * correctly. + */ +#define LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) + +static u32 ich_vtr_el2; + +static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) +{ + struct vgic_lr lr_desc; + u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)]; + + lr_desc.irq = val & GICH_LR_VIRTUALID; + if (lr_desc.irq <= 15) + lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; + else + lr_desc.source = 0; + lr_desc.state = 0; + + if (val & ICH_LR_PENDING_BIT) + lr_desc.state |= LR_STATE_PENDING; + if (val & ICH_LR_ACTIVE_BIT) + lr_desc.state |= LR_STATE_ACTIVE; + if (val & ICH_LR_EOI) + lr_desc.state |= LR_EOI_INT; + + return lr_desc; +} + +static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr lr_desc) +{ + u64 lr_val = (((u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | + lr_desc.irq); + + if (lr_desc.state & LR_STATE_PENDING) + lr_val |= ICH_LR_PENDING_BIT; + if (lr_desc.state & LR_STATE_ACTIVE) + lr_val |= ICH_LR_ACTIVE_BIT; + if (lr_desc.state & LR_EOI_INT) + lr_val |= ICH_LR_EOI; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; +} + +static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr lr_desc) +{ + if (!(lr_desc.state & LR_STATE_MASK)) + vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); +} + +static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu) +{ + return 
vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr; +} + +static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr; +} + +static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu) +{ + u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr; + u32 ret = 0; + + if (misr & ICH_MISR_EOI) + ret |= INT_STATUS_EOI; + if (misr & ICH_MISR_U) + ret |= INT_STATUS_UNDERFLOW; + + return ret; +} + +static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; + + vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; + vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; + vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; + vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; +} + +static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE; +} + +static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu) +{ + vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE; +} + +static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) +{ + u32 vmcr; + + vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; + vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; + vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; + vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; + + vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; +} + +static void vgic_v3_enable(struct kvm_vcpu *vcpu) +{ + /* + * By forcing VMCR to zero, the GIC will restore the binary + * points to their reset values. Anything else resets to zero + * anyway. + */ + vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = 0; + + /* Get the show on the road... 
*/ + vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr = ICH_HCR_EN; +} + +static const struct vgic_ops vgic_v3_ops = { + .get_lr = vgic_v3_get_lr, + .set_lr = vgic_v3_set_lr, + .sync_lr_elrsr = vgic_v3_sync_lr_elrsr, + .get_elrsr = vgic_v3_get_elrsr, + .get_eisr = vgic_v3_get_eisr, + .get_interrupt_status = vgic_v3_get_interrupt_status, + .enable_underflow = vgic_v3_enable_underflow, + .disable_underflow = vgic_v3_disable_underflow, + .get_vmcr = vgic_v3_get_vmcr, + .set_vmcr = vgic_v3_set_vmcr, + .enable = vgic_v3_enable, +}; + +static struct vgic_params vgic_v3_params; + +/** + * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT + * @node: pointer to the DT node + * @ops: address of a pointer to the GICv3 operations + * @params: address of a pointer to HW-specific parameters + * + * Returns 0 if a GICv3 has been found, with the low level operations + * in *ops and the HW parameters in *params. Returns an error code + * otherwise. + */ +int vgic_v3_probe(struct device_node *vgic_node, + const struct vgic_ops **ops, + const struct vgic_params **params) +{ + int ret = 0; + u32 gicv_idx; + struct resource vcpu_res; + struct vgic_params *vgic = &vgic_v3_params; + + vgic->maint_irq = irq_of_parse_and_map(vgic_node, 0); + if (!vgic->maint_irq) { + kvm_err("error getting vgic maintenance irq from DT\n"); + ret = -ENXIO; + goto out; + } + + ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + + /* + * The ListRegs field is 5 bits, but there is an architectural + * maximum of 16 list registers. Just ignore bit 4... 
+ */ + vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; + + if (of_property_read_u32(vgic_node, "#redistributor-regions", &gicv_idx)) + gicv_idx = 1; + + gicv_idx += 3; /* Also skip GICD, GICC, GICH */ + if (of_address_to_resource(vgic_node, gicv_idx, &vcpu_res)) { + kvm_err("Cannot obtain GICV region\n"); + ret = -ENXIO; + goto out; + } + + if (!PAGE_ALIGNED(vcpu_res.start)) { + kvm_err("GICV physical address 0x%llx not page aligned\n", + (unsigned long long)vcpu_res.start); + ret = -ENXIO; + goto out; + } + + if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { + kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", + (unsigned long long)resource_size(&vcpu_res), + PAGE_SIZE); + ret = -ENXIO; + goto out; + } + + vgic->vcpu_base = vcpu_res.start; + vgic->vctrl_base = NULL; + vgic->type = VGIC_V3; + + kvm_info("%s@%llx IRQ%d\n", vgic_node->name, + vcpu_res.start, vgic->maint_irq); + + *ops = &vgic_v3_ops; + *params = vgic; + +out: + of_node_put(vgic_node); + return ret; +} diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 476d3bf540a8..3aaca49de325 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -36,21 +36,22 @@ * How the whole thing works (courtesy of Christoffer Dall): * * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if - * something is pending - * - VGIC pending interrupts are stored on the vgic.irq_state vgic - * bitmap (this bitmap is updated by both user land ioctls and guest - * mmio ops, and other in-kernel peripherals such as the - * arch. timers) and indicate the 'wire' state. + * something is pending on the CPU interface. + * - Interrupts that are pending on the distributor are stored on the + * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land + * ioctls and guest mmio ops, and other in-kernel peripherals such as the + * arch. timers). 
* - Every time the bitmap changes, the irq_pending_on_cpu oracle is * recalculated * - To calculate the oracle, we need info for each cpu from * compute_pending_for_cpu, which considers: - * - PPI: dist->irq_state & dist->irq_enable - * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target - * - irq_spi_target is a 'formatted' version of the GICD_ICFGR + * - PPI: dist->irq_pending & dist->irq_enable + * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target + * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn * registers, stored on each vcpu. We only keep one bit of * information per interrupt, making sure that only one vcpu can * accept the interrupt. + * - If any of the above state changes, we must recalculate the oracle. * - The same is true when injecting an interrupt, except that we only * consider a single interrupt at a time. The irq_spi_cpu array * contains the target CPU for each SPI. @@ -60,13 +61,18 @@ * the 'line' again. This is achieved as such: * * - When a level interrupt is moved onto a vcpu, the corresponding - * bit in irq_active is set. As long as this bit is set, the line + * bit in irq_queued is set. As long as this bit is set, the line * will be ignored for further interrupts. The interrupt is injected * into the vcpu with the GICH_LR_EOI bit set (generate a * maintenance interrupt on EOI). * - When the interrupt is EOIed, the maintenance interrupt fires, - * and clears the corresponding bit in irq_active. This allow the + * and clears the corresponding bit in irq_queued. This allows the * interrupt line to be sampled again. + * - Note that level-triggered interrupts can also be set to pending from + * writes to GICD_ISPENDRn and lowering the external input line does not + * cause the interrupt to become inactive in such a situation. + * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become + * inactive as long as the external input line is held high. 
*/ #define VGIC_ADDR_UNDEF (-1) @@ -76,14 +82,6 @@ #define IMPLEMENTER_ARM 0x43b #define GICC_ARCH_VERSION_V2 0x2 -/* Physical address of vgic virtual cpu interface */ -static phys_addr_t vgic_vcpu_base; - -/* Virtual control interface base address */ -static void __iomem *vgic_vctrl_base; - -static struct device_node *vgic_node; - #define ACCESS_READ_VALUE (1 << 0) #define ACCESS_READ_RAZ (0 << 0) #define ACCESS_READ_MASK(x) ((x) & (1 << 0)) @@ -94,30 +92,90 @@ static struct device_node *vgic_node; #define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); +static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); static void vgic_update_state(struct kvm *kvm); static void vgic_kick_vcpus(struct kvm *kvm); +static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi); static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); -static u32 vgic_nr_lr; +static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); +static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); +static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); +static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); + +static const struct vgic_ops *vgic_ops; +static const struct vgic_params *vgic; + +/* + * struct vgic_bitmap contains a bitmap made of unsigned longs, but + * extracts u32s out of them. + * + * This does not work on 64-bit BE systems, because the bitmap access + * will store two consecutive 32-bit words with the higher-addressed + * register's bits at the lower index and the lower-addressed register's + * bits at the higher index. + * + * Therefore, swizzle the register index when accessing the 32-bit word + * registers to access the right register's value. 
+ */ +#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64 +#define REG_OFFSET_SWIZZLE 1 +#else +#define REG_OFFSET_SWIZZLE 0 +#endif + +static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs) +{ + int nr_longs; -static unsigned int vgic_maint_irq; + nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); + + b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL); + if (!b->private) + return -ENOMEM; + + b->shared = b->private + nr_cpus; + + return 0; +} + +static void vgic_free_bitmap(struct vgic_bitmap *b) +{ + kfree(b->private); + b->private = NULL; + b->shared = NULL; +} + +/* + * Call this function to convert a u64 value to an unsigned long * bitmask + * in a way that works on both 32-bit and 64-bit LE and BE platforms. + * + * Warning: Calling this function may modify *val. + */ +static unsigned long *u64_to_bitmask(u64 *val) +{ +#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 + *val = (*val >> 32) | (*val << 32); +#endif + return (unsigned long *)val; +} static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset) { offset >>= 2; if (!offset) - return x->percpu[cpuid].reg; + return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE; else - return x->shared.reg + offset - 1; + return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE); } static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, int cpuid, int irq) { if (irq < VGIC_NR_PRIVATE_IRQS) - return test_bit(irq, x->percpu[cpuid].reg_ul); + return test_bit(irq, x->private + cpuid); - return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul); + return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared); } static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, @@ -126,9 +184,9 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, unsigned long *reg; if (irq < VGIC_NR_PRIVATE_IRQS) { - reg = x->percpu[cpuid].reg_ul; + reg = x->private + cpuid; } else { - reg = x->shared.reg_ul; + reg = 
x->shared; irq -= VGIC_NR_PRIVATE_IRQS; } @@ -140,24 +198,49 @@ static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) { - if (unlikely(cpuid >= VGIC_MAX_CPUS)) - return NULL; - return x->percpu[cpuid].reg_ul; + return x->private + cpuid; } static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) { - return x->shared.reg_ul; + return x->shared; +} + +static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs) +{ + int size; + + size = nr_cpus * VGIC_NR_PRIVATE_IRQS; + size += nr_irqs - VGIC_NR_PRIVATE_IRQS; + + x->private = kzalloc(size, GFP_KERNEL); + if (!x->private) + return -ENOMEM; + + x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32); + return 0; +} + +static void vgic_free_bytemap(struct vgic_bytemap *b) +{ + kfree(b->private); + b->private = NULL; + b->shared = NULL; } static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) { - offset >>= 2; - BUG_ON(offset > (VGIC_NR_IRQS / 4)); - if (offset < 8) - return x->percpu[cpuid] + offset; - else - return x->shared + offset - 8; + u32 *reg; + + if (offset < VGIC_NR_PRIVATE_IRQS) { + reg = x->private; + offset += cpuid * VGIC_NR_PRIVATE_IRQS; + } else { + reg = x->shared; + offset -= VGIC_NR_PRIVATE_IRQS; + } + + return reg + (offset / sizeof(u32)); } #define VGIC_CFG_LEVEL 0 @@ -179,46 +262,81 @@ static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); } -static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) +static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq); +} + +static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + 
vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1); +} + +static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0); +} + +static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); + return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq); } -static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); + vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1); } -static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); + vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0); +} + +static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq); +} + +static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); } static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq); + return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq); } -static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq) { 
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1); + vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1); } -static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq) +static void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0); + vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0); } static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) @@ -239,14 +357,19 @@ static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) vcpu->arch.vgic_cpu.pending_shared); } +static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq) +{ + return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq); +} + static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) { - return *((u32 *)mmio->data) & mask; + return le32_to_cpu(*((u32 *)mmio->data)) & mask; } static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) { - *((u32 *)mmio->data) = value & mask; + *((u32 *)mmio->data) = cpu_to_le32(value) & mask; } /** @@ -330,7 +453,7 @@ static bool handle_mmio_misc(struct kvm_vcpu *vcpu, case 4: /* GICD_TYPER */ reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - reg |= (VGIC_NR_IRQS >> 5) - 1; + reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1; vgic_reg_access(mmio, &reg, word_offset, ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); break; @@ -392,11 +515,33 @@ static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, - vcpu->vcpu_id, offset); + u32 *reg, orig; + u32 level_mask; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu->vcpu_id, offset); + level_mask = (~(*reg)); + + /* Mark both level and edge triggered irqs as pending */ + reg = 
vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); + orig = *reg; vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + if (mmio->is_write) { + /* Set the soft-pending flag only for level-triggered irqs */ + reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); + *reg &= level_mask; + + /* Ignore writes to SGIs */ + if (offset < 2) { + *reg &= ~0xffff; + *reg |= orig & 0xffff; + } + vgic_update_state(vcpu->kvm); return true; } @@ -408,11 +553,34 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state, - vcpu->vcpu_id, offset); + u32 *level_active; + u32 *reg, orig; + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu->vcpu_id, offset); + orig = *reg; vgic_reg_access(mmio, reg, offset, ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); if (mmio->is_write) { + /* Re-set level triggered level-active interrupts */ + level_active = vgic_bitmap_get_reg(&dist->irq_level, + vcpu->vcpu_id, offset); + reg = vgic_bitmap_get_reg(&dist->irq_pending, + vcpu->vcpu_id, offset); + *reg |= *level_active; + + /* Ignore writes to SGIs */ + if (offset < 2) { + *reg &= ~0xffff; + *reg |= orig & 0xffff; + } + + /* Clear soft-pending flags */ + reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, + vcpu->vcpu_id, offset); + vgic_reg_access(mmio, reg, offset, + ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); + vgic_update_state(vcpu->kvm); return true; } @@ -593,18 +761,6 @@ static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, return false; } -#define LR_CPUID(lr) \ - (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT) -#define LR_IRQID(lr) \ - ((lr) & GICH_LR_VIRTUALID) - -static void vgic_retire_lr(int lr_nr, int irq, struct vgic_cpu *vgic_cpu) -{ - clear_bit(lr_nr, 
vgic_cpu->lr_used); - vgic_cpu->vgic_lr[lr_nr] &= ~GICH_LR_STATE; - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; -} - /** * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs @@ -622,13 +778,10 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; int vcpu_id = vcpu->vcpu_id; - int i, irq, source_cpu; - u32 *lr; + int i; for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { - lr = &vgic_cpu->vgic_lr[i]; - irq = LR_IRQID(*lr); - source_cpu = LR_CPUID(*lr); + struct vgic_lr lr = vgic_get_lr(vcpu, i); /* * There are three options for the state bits: @@ -640,7 +793,7 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * If the LR holds only an active interrupt (not pending) then * just leave it alone. */ - if ((*lr & GICH_LR_STATE) == GICH_LR_ACTIVE_BIT) + if ((lr.state & LR_STATE_MASK) == LR_STATE_ACTIVE) continue; /* @@ -649,18 +802,21 @@ static void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * is fine, then we are only setting a few bits that were * already set. */ - vgic_dist_irq_set(vcpu, irq); - if (irq < VGIC_NR_SGIS) - dist->irq_sgi_sources[vcpu_id][irq] |= 1 << source_cpu; - *lr &= ~GICH_LR_PENDING_BIT; + vgic_dist_irq_set_pending(vcpu, lr.irq); + if (lr.irq < VGIC_NR_SGIS) + *vgic_get_sgi_sources(dist, vcpu_id, lr.irq) |= 1 << lr.source; + lr.state &= ~LR_STATE_PENDING; + vgic_set_lr(vcpu, i, lr); /* * If there's no state left on the LR (it could still be * active), then the LR does not hold any useful info and can * be marked as free for other use. */ - if (!(*lr & GICH_LR_STATE)) - vgic_retire_lr(i, irq, vgic_cpu); + if (!(lr.state & LR_STATE_MASK)) { + vgic_retire_lr(i, lr.irq, vcpu); + vgic_irq_clear_queued(vcpu, lr.irq); + } /* Finally update the VGIC state. 
*/ vgic_update_state(vcpu->kvm); @@ -674,7 +830,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int sgi; - int min_sgi = (offset & ~0x3) * 4; + int min_sgi = (offset & ~0x3); int max_sgi = min_sgi + 3; int vcpu_id = vcpu->vcpu_id; u32 reg = 0; @@ -682,7 +838,7 @@ static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, /* Copy source SGIs from distributor side */ for (sgi = min_sgi; sgi <= max_sgi; sgi++) { int shift = 8 * (sgi - min_sgi); - reg |= (u32)dist->irq_sgi_sources[vcpu_id][sgi] << shift; + reg |= ((u32)*vgic_get_sgi_sources(dist, vcpu_id, sgi)) << shift; } mmio_data_write(mmio, ~0, reg); @@ -695,7 +851,7 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int sgi; - int min_sgi = (offset & ~0x3) * 4; + int min_sgi = (offset & ~0x3); int max_sgi = min_sgi + 3; int vcpu_id = vcpu->vcpu_id; u32 reg; @@ -706,14 +862,15 @@ static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, /* Clear pending SGIs on the distributor */ for (sgi = min_sgi; sgi <= max_sgi; sgi++) { u8 mask = reg >> (8 * (sgi - min_sgi)); + u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi); if (set) { - if ((dist->irq_sgi_sources[vcpu_id][sgi] & mask) != mask) + if ((*src & mask) != mask) updated = true; - dist->irq_sgi_sources[vcpu_id][sgi] |= mask; + *src |= mask; } else { - if (dist->irq_sgi_sources[vcpu_id][sgi] & mask) + if (*src & mask) updated = true; - dist->irq_sgi_sources[vcpu_id][sgi] &= ~mask; + *src &= ~mask; } } @@ -752,6 +909,7 @@ static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, struct mmio_range { phys_addr_t base; unsigned long len; + int bits_per_irq; bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset); }; @@ -760,56 +918,67 @@ static const struct mmio_range vgic_dist_ranges[] = { { .base = GIC_DIST_CTRL, .len = 12, + .bits_per_irq = 0, .handle_mmio = handle_mmio_misc, }, { .base = 
GIC_DIST_IGROUP, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_ENABLE_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_set_enable_reg, }, { .base = GIC_DIST_ENABLE_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_clear_enable_reg, }, { .base = GIC_DIST_PENDING_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_set_pending_reg, }, { .base = GIC_DIST_PENDING_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_clear_pending_reg, }, { .base = GIC_DIST_ACTIVE_SET, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_ACTIVE_CLEAR, - .len = VGIC_NR_IRQS / 8, + .len = VGIC_MAX_IRQS / 8, + .bits_per_irq = 1, .handle_mmio = handle_mmio_raz_wi, }, { .base = GIC_DIST_PRI, - .len = VGIC_NR_IRQS, + .len = VGIC_MAX_IRQS, + .bits_per_irq = 8, .handle_mmio = handle_mmio_priority_reg, }, { .base = GIC_DIST_TARGET, - .len = VGIC_NR_IRQS, + .len = VGIC_MAX_IRQS, + .bits_per_irq = 8, .handle_mmio = handle_mmio_target_reg, }, { .base = GIC_DIST_CONFIG, - .len = VGIC_NR_IRQS / 4, + .len = VGIC_MAX_IRQS / 4, + .bits_per_irq = 2, .handle_mmio = handle_mmio_cfg_reg, }, { @@ -847,6 +1016,22 @@ struct mmio_range *find_matching_range(const struct mmio_range *ranges, return NULL; } +static bool vgic_validate_access(const struct vgic_dist *dist, + const struct mmio_range *range, + unsigned long offset) +{ + int irq; + + if (!range->bits_per_irq) + return true; /* Not an irq-based access */ + + irq = offset * 8 / range->bits_per_irq; + if (irq >= dist->nr_irqs) + return false; + + return true; +} + /** * vgic_handle_mmio - handle an in-kernel MMIO access * @vcpu: pointer to the vcpu performing the 
access @@ -886,7 +1071,13 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, spin_lock(&vcpu->kvm->arch.vgic.lock); offset = mmio->phys_addr - range->base - base; - updated_state = range->handle_mmio(vcpu, mmio, offset); + if (vgic_validate_access(dist, range, offset)) { + updated_state = range->handle_mmio(vcpu, mmio, offset); + } else { + vgic_reg_access(mmio, NULL, offset, + ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); + updated_state = false; + } spin_unlock(&vcpu->kvm->arch.vgic.lock); kvm_prepare_mmio(run, mmio); kvm_handle_mmio_return(vcpu, run); @@ -897,6 +1088,11 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run, return true; } +static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) +{ + return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; +} + static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) { struct kvm *kvm = vcpu->kvm; @@ -929,8 +1125,8 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) kvm_for_each_vcpu(c, vcpu, kvm) { if (target_cpus & 1) { /* Flag the SGI as pending */ - vgic_dist_irq_set(vcpu, sgi); - dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id; + vgic_dist_irq_set_pending(vcpu, sgi); + *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id; kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); } @@ -938,32 +1134,38 @@ static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) } } +static int vgic_nr_shared_irqs(struct vgic_dist *dist) +{ + return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; +} + static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; unsigned long *pending, *enabled, *pend_percpu, *pend_shared; unsigned long pending_private, pending_shared; + int nr_shared = vgic_nr_shared_irqs(dist); int vcpu_id; vcpu_id = vcpu->vcpu_id; pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; pend_shared = vcpu->arch.vgic_cpu.pending_shared; - pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id); + 
pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id); enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); - pending = vgic_bitmap_get_shared_map(&dist->irq_state); + pending = vgic_bitmap_get_shared_map(&dist->irq_pending); enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS); + bitmap_and(pend_shared, pending, enabled, nr_shared); bitmap_and(pend_shared, pend_shared, vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - VGIC_NR_SHARED_IRQS); + nr_shared); pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); - pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS); + pending_shared = find_first_bit(pend_shared, nr_shared); return (pending_private < VGIC_NR_PRIVATE_IRQS || - pending_shared < VGIC_NR_SHARED_IRQS); + pending_shared < vgic_nr_shared_irqs(dist)); } /* @@ -977,20 +1179,85 @@ static void vgic_update_state(struct kvm *kvm) int c; if (!dist->enabled) { - set_bit(0, &dist->irq_pending_on_cpu); + set_bit(0, dist->irq_pending_on_cpu); return; } kvm_for_each_vcpu(c, vcpu, kvm) { if (compute_pending_for_cpu(vcpu)) { pr_debug("CPU%d has pending interrupts\n", c); - set_bit(c, &dist->irq_pending_on_cpu); + set_bit(c, dist->irq_pending_on_cpu); } } } -#define MK_LR_PEND(src, irq) \ - (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq)) +static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr) +{ + return vgic_ops->get_lr(vcpu, lr); +} + +static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr vlr) +{ + vgic_ops->set_lr(vcpu, lr, vlr); +} + +static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, + struct vgic_lr vlr) +{ + vgic_ops->sync_lr_elrsr(vcpu, lr, vlr); +} + +static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) +{ + return vgic_ops->get_elrsr(vcpu); +} + +static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu) +{ + 
return vgic_ops->get_eisr(vcpu); +} + +static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu) +{ + return vgic_ops->get_interrupt_status(vcpu); +} + +static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu) +{ + vgic_ops->enable_underflow(vcpu); +} + +static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu) +{ + vgic_ops->disable_underflow(vcpu); +} + +static inline void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + vgic_ops->get_vmcr(vcpu, vmcr); +} + +static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) +{ + vgic_ops->set_vmcr(vcpu, vmcr); +} + +static inline void vgic_enable(struct kvm_vcpu *vcpu) +{ + vgic_ops->enable(vcpu); +} + +static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); + + vlr.state = 0; + vgic_set_lr(vcpu, lr_nr, vlr); + clear_bit(lr_nr, vgic_cpu->lr_used); + vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; +} /* * An interrupt may have been disabled after being made pending on the @@ -1006,13 +1273,13 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; int lr; - for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) { - int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) { + struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - if (!vgic_irq_is_enabled(vcpu, irq)) { - vgic_retire_lr(lr, irq, vgic_cpu); - if (vgic_irq_is_active(vcpu, irq)) - vgic_irq_clear_active(vcpu, irq); + if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { + vgic_retire_lr(lr, vlr.irq, vcpu); + if (vgic_irq_is_queued(vcpu, vlr.irq)) + vgic_irq_clear_queued(vcpu, vlr.irq); } } } @@ -1024,40 +1291,48 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + 
struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_lr vlr; int lr; /* Sanitize the input... */ BUG_ON(sgi_source_id & ~7); BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); - BUG_ON(irq >= VGIC_NR_IRQS); + BUG_ON(irq >= dist->nr_irqs); kvm_debug("Queue IRQ%d\n", irq); lr = vgic_cpu->vgic_irq_lr_map[irq]; /* Do we have an active interrupt for the same CPUID? */ - if (lr != LR_EMPTY && - (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) { - kvm_debug("LR%d piggyback for IRQ%d %x\n", - lr, irq, vgic_cpu->vgic_lr[lr]); - BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); - vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT; - return true; + if (lr != LR_EMPTY) { + vlr = vgic_get_lr(vcpu, lr); + if (vlr.source == sgi_source_id) { + kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); + BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); + vlr.state |= LR_STATE_PENDING; + vgic_set_lr(vcpu, lr, vlr); + return true; + } } /* Try to use another LR for this interrupt */ lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, - vgic_cpu->nr_lr); - if (lr >= vgic_cpu->nr_lr) + vgic->nr_lr); + if (lr >= vgic->nr_lr) return false; kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq); vgic_cpu->vgic_irq_lr_map[irq] = lr; set_bit(lr, vgic_cpu->lr_used); + vlr.irq = irq; + vlr.source = sgi_source_id; + vlr.state = LR_STATE_PENDING; if (!vgic_irq_is_edge(vcpu, irq)) - vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI; + vlr.state |= LR_EOI_INT; + + vgic_set_lr(vcpu, lr, vlr); return true; } @@ -1069,14 +1344,14 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) int vcpu_id = vcpu->vcpu_id; int c; - sources = dist->irq_sgi_sources[vcpu_id][irq]; + sources = *vgic_get_sgi_sources(dist, vcpu_id, irq); - for_each_set_bit(c, &sources, VGIC_MAX_CPUS) { + for_each_set_bit(c, &sources, dist->nr_cpus) { if (vgic_queue_irq(vcpu, c, irq)) clear_bit(c, &sources); } - dist->irq_sgi_sources[vcpu_id][irq] = sources; + 
*vgic_get_sgi_sources(dist, vcpu_id, irq) = sources; /* * If the sources bitmap has been cleared it means that we @@ -1085,7 +1360,7 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) * our emulated gic and can get rid of them. */ if (!sources) { - vgic_dist_irq_clear(vcpu, irq); + vgic_dist_irq_clear_pending(vcpu, irq); vgic_cpu_irq_clear(vcpu, irq); return true; } @@ -1095,15 +1370,15 @@ static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq) static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) { - if (vgic_irq_is_active(vcpu, irq)) + if (!vgic_can_sample_irq(vcpu, irq)) return true; /* level interrupt, already queued */ if (vgic_queue_irq(vcpu, 0, irq)) { if (vgic_irq_is_edge(vcpu, irq)) { - vgic_dist_irq_clear(vcpu, irq); + vgic_dist_irq_clear_pending(vcpu, irq); vgic_cpu_irq_clear(vcpu, irq); } else { - vgic_irq_set_active(vcpu, irq); + vgic_irq_set_queued(vcpu, irq); } return true; @@ -1148,66 +1423,83 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) } /* SPIs */ - for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) { + for_each_set_bit(i, vgic_cpu->pending_shared, vgic_nr_shared_irqs(dist)) { if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) overflow = 1; } epilog: if (overflow) { - vgic_cpu->vgic_hcr |= GICH_HCR_UIE; + vgic_enable_underflow(vcpu); } else { - vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; + vgic_disable_underflow(vcpu); /* * We're about to run this VCPU, and we've consumed * everything the distributor had in store for * us. Claim we don't have anything pending. We'll * adjust that if needed while exiting. 
*/ - clear_bit(vcpu_id, &dist->irq_pending_on_cpu); + clear_bit(vcpu_id, dist->irq_pending_on_cpu); } } static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u32 status = vgic_get_interrupt_status(vcpu); bool level_pending = false; - kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr); + kvm_debug("STATUS = %08x\n", status); - if (vgic_cpu->vgic_misr & GICH_MISR_EOI) { + if (status & INT_STATUS_EOI) { /* * Some level interrupts have been EOIed. Clear their * active bit. */ - int lr, irq; + u64 eisr = vgic_get_eisr(vcpu); + unsigned long *eisr_ptr = u64_to_bitmask(&eisr); + int lr; + + for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { + struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); - for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr, - vgic_cpu->nr_lr) { - irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + vgic_irq_clear_queued(vcpu, vlr.irq); + WARN_ON(vlr.state & LR_STATE_MASK); + vlr.state = 0; + vgic_set_lr(vcpu, lr, vlr); - vgic_irq_clear_active(vcpu, irq); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI; + /* + * If the IRQ was EOIed it was also ACKed and we + * therefore assume we can clear the soft pending + * state (should it have been set) for this interrupt. + * + * Note: if the IRQ soft pending state was set after + * the IRQ was acked, it actually shouldn't be + * cleared, but we have no way of knowing that unless + * we start trapping ACKs when the soft-pending state + * is set. + */ + vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); /* Any additional pending interrupt? */ - if (vgic_dist_irq_is_pending(vcpu, irq)) { - vgic_cpu_irq_set(vcpu, irq); + if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { + vgic_cpu_irq_set(vcpu, vlr.irq); level_pending = true; } else { - vgic_cpu_irq_clear(vcpu, irq); + vgic_dist_irq_clear_pending(vcpu, vlr.irq); + vgic_cpu_irq_clear(vcpu, vlr.irq); } /* * Despite being EOIed, the LR may not have * been marked as empty. 
*/ - set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr); - vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT; + vgic_sync_lr_elrsr(vcpu, lr, vlr); } } - if (vgic_cpu->vgic_misr & GICH_MISR_U) - vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE; + if (status & INT_STATUS_UNDERFLOW) + vgic_disable_underflow(vcpu); return level_pending; } @@ -1220,30 +1512,32 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u64 elrsr; + unsigned long *elrsr_ptr; int lr, pending; bool level_pending; level_pending = vgic_process_maintenance(vcpu); + elrsr = vgic_get_elrsr(vcpu); + elrsr_ptr = u64_to_bitmask(&elrsr); /* Clear mappings for empty LRs */ - for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr, - vgic_cpu->nr_lr) { - int irq; + for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) { + struct vgic_lr vlr; if (!test_and_clear_bit(lr, vgic_cpu->lr_used)) continue; - irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID; + vlr = vgic_get_lr(vcpu, lr); - BUG_ON(irq >= VGIC_NR_IRQS); - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; + BUG_ON(vlr.irq >= dist->nr_irqs); + vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY; } /* Check if we still have something up our sleeve... 
*/ - pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr, - vgic_cpu->nr_lr); - if (level_pending || pending < vgic_cpu->nr_lr) - set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); + pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); + if (level_pending || pending < vgic->nr_lr) + set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); } void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) @@ -1277,7 +1571,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) return 0; - return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu); + return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); } static void vgic_kick_vcpus(struct kvm *kvm) @@ -1297,34 +1591,36 @@ static void vgic_kick_vcpus(struct kvm *kvm) static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) { - int is_edge = vgic_irq_is_edge(vcpu, irq); - int state = vgic_dist_irq_is_pending(vcpu, irq); + int edge_triggered = vgic_irq_is_edge(vcpu, irq); /* * Only inject an interrupt if: * - edge triggered and we have a rising edge * - level triggered and we change level */ - if (is_edge) + if (edge_triggered) { + int state = vgic_dist_irq_is_pending(vcpu, irq); return level > state; - else + } else { + int state = vgic_dist_irq_get_level(vcpu, irq); return level != state; + } } -static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, +static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int is_edge, is_level; + int edge_triggered, level_triggered; int enabled; bool ret = true; spin_lock(&dist->lock); vcpu = kvm_get_vcpu(kvm, cpuid); - is_edge = vgic_irq_is_edge(vcpu, irq_num); - is_level = !is_edge; + edge_triggered = vgic_irq_is_edge(vcpu, irq_num); + level_triggered = !edge_triggered; if (!vgic_validate_injection(vcpu, irq_num, level)) { ret = false; @@ -1338,10 +1634,19 @@ static bool vgic_update_irq_state(struct kvm *kvm, 
int cpuid, kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); - if (level) - vgic_dist_irq_set(vcpu, irq_num); - else - vgic_dist_irq_clear(vcpu, irq_num); + if (level) { + if (level_triggered) + vgic_dist_irq_set_level(vcpu, irq_num); + vgic_dist_irq_set_pending(vcpu, irq_num); + } else { + if (level_triggered) { + vgic_dist_irq_clear_level(vcpu, irq_num); + if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) + vgic_dist_irq_clear_pending(vcpu, irq_num); + } else { + vgic_dist_irq_clear_pending(vcpu, irq_num); + } + } enabled = vgic_irq_is_enabled(vcpu, irq_num); @@ -1350,7 +1655,7 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, goto out; } - if (is_level && vgic_irq_is_active(vcpu, irq_num)) { + if (!vgic_can_sample_irq(vcpu, irq_num)) { /* * Level interrupt in progress, will be picked up * when EOId. @@ -1361,7 +1666,7 @@ static bool vgic_update_irq_state(struct kvm *kvm, int cpuid, if (level) { vgic_cpu_irq_set(vcpu, irq_num); - set_bit(cpuid, &dist->irq_pending_on_cpu); + set_bit(cpuid, dist->irq_pending_on_cpu); } out: @@ -1387,7 +1692,8 @@ out: int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, bool level) { - if (vgic_update_irq_state(kvm, cpuid, irq_num, level)) + if (likely(vgic_initialized(kvm)) && + vgic_update_irq_pending(kvm, cpuid, irq_num, level)) vgic_kick_vcpus(kvm); return 0; @@ -1404,6 +1710,32 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data) return IRQ_HANDLED; } +void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + kfree(vgic_cpu->pending_shared); + kfree(vgic_cpu->vgic_irq_lr_map); + vgic_cpu->pending_shared = NULL; + vgic_cpu->vgic_irq_lr_map = NULL; +} + +static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8; + vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); + vgic_cpu->vgic_irq_lr_map = 
kzalloc(nr_irqs, GFP_KERNEL); + + if (!vgic_cpu->pending_shared || !vgic_cpu->vgic_irq_lr_map) { + kvm_vgic_vcpu_destroy(vcpu); + return -ENOMEM; + } + + return 0; +} + /** * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state * @vcpu: pointer to the vcpu struct @@ -1411,16 +1743,13 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data) * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to * this vcpu and enable the VGIC for this VCPU */ -int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) +static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int i; - if (vcpu->vcpu_id >= VGIC_MAX_CPUS) - return -EBUSY; - - for (i = 0; i < VGIC_NR_IRQS; i++) { + for (i = 0; i < dist->nr_irqs; i++) { if (i < VGIC_NR_PPIS) vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); @@ -1432,135 +1761,119 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) } /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. + * Store the number of LRs per vcpu, so we don't have to go + * all the way to the distributor structure to find out. Only + * assembly code should use this one. */ - vgic_cpu->vgic_vmcr = 0; - - vgic_cpu->nr_lr = vgic_nr_lr; - vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... 
*/ + vgic_cpu->nr_lr = vgic->nr_lr; - return 0; + vgic_enable(vcpu); } -static void vgic_init_maintenance_interrupt(void *info) +void kvm_vgic_destroy(struct kvm *kvm) { - enable_percpu_irq(vgic_maint_irq, 0); + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_destroy(vcpu); + + vgic_free_bitmap(&dist->irq_enabled); + vgic_free_bitmap(&dist->irq_level); + vgic_free_bitmap(&dist->irq_pending); + vgic_free_bitmap(&dist->irq_soft_pend); + vgic_free_bitmap(&dist->irq_queued); + vgic_free_bitmap(&dist->irq_cfg); + vgic_free_bytemap(&dist->irq_priority); + if (dist->irq_spi_target) { + for (i = 0; i < dist->nr_cpus; i++) + vgic_free_bitmap(&dist->irq_spi_target[i]); + } + kfree(dist->irq_sgi_sources); + kfree(dist->irq_spi_cpu); + kfree(dist->irq_spi_target); + kfree(dist->irq_pending_on_cpu); + dist->irq_sgi_sources = NULL; + dist->irq_spi_cpu = NULL; + dist->irq_spi_target = NULL; + dist->irq_pending_on_cpu = NULL; } -static int vgic_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) +/* + * Allocate and initialize the various data structures. Must be called + * with kvm->lock held! + */ +static int vgic_init_maps(struct kvm *kvm) { - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: - vgic_init_maintenance_interrupt(NULL); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - disable_percpu_irq(vgic_maint_irq); - break; - } + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int nr_cpus, nr_irqs; + int ret, i; - return NOTIFY_OK; -} + if (dist->nr_cpus) /* Already allocated */ + return 0; -static struct notifier_block vgic_cpu_nb = { - .notifier_call = vgic_cpu_notify, -}; + nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus); + if (!nr_cpus) /* No vcpus? Can't be good... 
*/ + return -EINVAL; -int kvm_vgic_hyp_init(void) -{ - int ret; - struct resource vctrl_res; - struct resource vcpu_res; + /* + * If nobody configured the number of interrupts, use the + * legacy one. + */ + if (!dist->nr_irqs) + dist->nr_irqs = VGIC_NR_IRQS_LEGACY; - vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic"); - if (!vgic_node) { - kvm_err("error: no compatible vgic node in DT\n"); - return -ENODEV; - } + nr_irqs = dist->nr_irqs; - vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0); - if (!vgic_maint_irq) { - kvm_err("error getting vgic maintenance irq from DT\n"); - ret = -ENXIO; - goto out; - } + ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs); + ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs); + ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs); - ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", vgic_maint_irq); + if (ret) goto out; - } - - ret = __register_cpu_notifier(&vgic_cpu_nb); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - ret = of_address_to_resource(vgic_node, 2, &vctrl_res); - if (ret) { - kvm_err("Cannot obtain VCTRL resource\n"); - goto out_free_irq; - } - - vgic_vctrl_base = of_iomap(vgic_node, 2); - if (!vgic_vctrl_base) { - kvm_err("Cannot ioremap VCTRL\n"); + dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL); + dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL); + dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus, + GFP_KERNEL); + dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), + 
GFP_KERNEL); + if (!dist->irq_sgi_sources || + !dist->irq_spi_cpu || + !dist->irq_spi_target || + !dist->irq_pending_on_cpu) { ret = -ENOMEM; - goto out_free_irq; - } - - vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR); - vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1; - - ret = create_hyp_io_mappings(vgic_vctrl_base, - vgic_vctrl_base + resource_size(&vctrl_res), - vctrl_res.start); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out_unmap; + goto out; } - if (of_address_to_resource(vgic_node, 3, &vcpu_res)) { - kvm_err("Cannot obtain VCPU resource\n"); - ret = -ENXIO; - goto out_unmap; - } + for (i = 0; i < nr_cpus; i++) + ret |= vgic_init_bitmap(&dist->irq_spi_target[i], + nr_cpus, nr_irqs); - if (!PAGE_ALIGNED(vcpu_res.start)) { - kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res.start); - ret = -ENXIO; - goto out_unmap; - } + if (ret) + goto out; - if (!PAGE_ALIGNED(resource_size(&vcpu_res))) { - kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&vcpu_res), - PAGE_SIZE); - ret = -ENXIO; - goto out_unmap; + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_vcpu_init_maps(vcpu, nr_irqs); + if (ret) { + kvm_err("VGIC: Failed to allocate vcpu memory\n"); + break; + } } - vgic_vcpu_base = vcpu_res.start; - - kvm_info("%s@%llx IRQ%d\n", vgic_node->name, - vctrl_res.start, vgic_maint_irq); - on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); - - goto out; + for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4) + vgic_set_target_reg(kvm, 0, i); -out_unmap: - iounmap(vgic_vctrl_base); -out_free_irq: - free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus()); out: - of_node_put(vgic_node); + if (ret) + kvm_vgic_destroy(kvm); + return ret; } @@ -1575,6 +1888,7 @@ out: */ int kvm_vgic_init(struct kvm *kvm) { + struct kvm_vcpu *vcpu; int ret = 0, i; if (!irqchip_in_kernel(kvm)) @@ -1592,18 +1906,27 @@ int kvm_vgic_init(struct kvm *kvm) goto out; } + ret = 
vgic_init_maps(kvm); + if (ret) { + kvm_err("Unable to allocate maps\n"); + goto out; + } + ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base, - vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE); + vgic->vcpu_base, KVM_VGIC_V2_CPU_SIZE, + true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); goto out; } - for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4) - vgic_set_target_reg(kvm, 0, i); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vgic_vcpu_init(vcpu); kvm->arch.vgic.ready = true; out: + if (ret) + kvm_vgic_destroy(kvm); mutex_unlock(&kvm->lock); return ret; } @@ -1639,7 +1962,8 @@ int kvm_vgic_create(struct kvm *kvm) } spin_lock_init(&kvm->arch.vgic.lock); - kvm->arch.vgic.vctrl_base = vgic_vctrl_base; + kvm->arch.vgic.in_kernel = true; + kvm->arch.vgic.vctrl_base = vgic->vctrl_base; kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; @@ -1654,7 +1978,7 @@ out: return ret; } -static bool vgic_ioaddr_overlap(struct kvm *kvm) +static int vgic_ioaddr_overlap(struct kvm *kvm) { phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; @@ -1738,39 +2062,40 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, phys_addr_t offset) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u32 reg, mask = 0, shift = 0; bool updated = false; + struct vgic_vmcr vmcr; + u32 *vmcr_field; + u32 reg; + + vgic_get_vmcr(vcpu, &vmcr); switch (offset & ~0x3) { case GIC_CPU_CTRL: - mask = GICH_VMCR_CTRL_MASK; - shift = GICH_VMCR_CTRL_SHIFT; + vmcr_field = &vmcr.ctlr; break; case GIC_CPU_PRIMASK: - mask = GICH_VMCR_PRIMASK_MASK; - shift = GICH_VMCR_PRIMASK_SHIFT; + vmcr_field = &vmcr.pmr; break; case GIC_CPU_BINPOINT: - mask = GICH_VMCR_BINPOINT_MASK; - shift = GICH_VMCR_BINPOINT_SHIFT; + vmcr_field = &vmcr.bpr; break; case GIC_CPU_ALIAS_BINPOINT: - mask = 
GICH_VMCR_ALIAS_BINPOINT_MASK; - shift = GICH_VMCR_ALIAS_BINPOINT_SHIFT; + vmcr_field = &vmcr.abpr; break; + default: + BUG(); } if (!mmio->is_write) { - reg = (vgic_cpu->vgic_vmcr & mask) >> shift; + reg = *vmcr_field; mmio_data_write(mmio, ~0, reg); } else { reg = mmio_data_read(mmio, ~0); - reg = (reg << shift) & mask; - if (reg != (vgic_cpu->vgic_vmcr & mask)) + if (reg != *vmcr_field) { + *vmcr_field = reg; + vgic_set_vmcr(vcpu, &vmcr); updated = true; - vgic_cpu->vgic_vmcr &= ~mask; - vgic_cpu->vgic_vmcr |= reg; + } } return updated; } @@ -1842,6 +2167,10 @@ static int vgic_attr_regs_access(struct kvm_device *dev, mutex_lock(&dev->kvm->lock); + ret = vgic_init_maps(dev->kvm); + if (ret) + goto out; + if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { ret = -EINVAL; goto out; @@ -1939,6 +2268,36 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return vgic_attr_regs_access(dev, attr, ®, true); } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + u32 val; + int ret = 0; + + if (get_user(val, uaddr)) + return -EFAULT; + + /* + * We require: + * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs + * - at most 1024 interrupts + * - a multiple of 32 interrupts + */ + if (val < (VGIC_NR_PRIVATE_IRQS + 32) || + val > VGIC_MAX_IRQS || + (val & 31)) + return -EINVAL; + + mutex_lock(&dev->kvm->lock); + + if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs) + ret = -EBUSY; + else + dev->kvm->arch.vgic.nr_irqs = val; + + mutex_unlock(&dev->kvm->lock); + + return ret; + } } @@ -1975,6 +2334,11 @@ static int vgic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = put_user(reg, uaddr); break; } + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { + u32 __user *uaddr = (u32 __user *)(long)attr->addr; + r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr); + break; + } } @@ -2011,6 +2375,8 @@ static int vgic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case 
KVM_DEV_ARM_VGIC_GRP_CPU_REGS: offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; return vgic_has_attr_regs(vgic_cpu_ranges, offset); + case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: + return 0; } return -ENXIO; } @@ -2025,7 +2391,7 @@ static int vgic_create(struct kvm_device *dev, u32 type) return kvm_vgic_create(dev->kvm); } -struct kvm_device_ops kvm_arm_vgic_v2_ops = { +static struct kvm_device_ops kvm_arm_vgic_v2_ops = { .name = "kvm-arm-vgic", .create = vgic_create, .destroy = vgic_destroy, @@ -2033,3 +2399,81 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = { .get_attr = vgic_get_attr, .has_attr = vgic_has_attr, }; + +static void vgic_init_maintenance_interrupt(void *info) +{ + enable_percpu_irq(vgic->maint_irq, 0); +} + +static int vgic_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + vgic_init_maintenance_interrupt(NULL); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + disable_percpu_irq(vgic->maint_irq); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block vgic_cpu_nb = { + .notifier_call = vgic_cpu_notify, +}; + +static const struct of_device_id vgic_ids[] = { + { .compatible = "arm,cortex-a15-gic", .data = vgic_v2_probe, }, + { .compatible = "arm,gic-v3", .data = vgic_v3_probe, }, + {}, +}; + +int kvm_vgic_hyp_init(void) +{ + const struct of_device_id *matched_id; + const int (*vgic_probe)(struct device_node *,const struct vgic_ops **, + const struct vgic_params **); + struct device_node *vgic_node; + int ret; + + vgic_node = of_find_matching_node_and_match(NULL, + vgic_ids, &matched_id); + if (!vgic_node) { + kvm_err("error: no compatible GIC node found\n"); + return -ENODEV; + } + + vgic_probe = matched_id->data; + ret = vgic_probe(vgic_node, &vgic_ops, &vgic); + if (ret) + return ret; + + ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, + "vgic", kvm_get_running_vcpus()); + if (ret) { + kvm_err("Cannot register interrupt 
%d\n", vgic->maint_irq); + return ret; + } + + ret = __register_cpu_notifier(&vgic_cpu_nb); + if (ret) { + kvm_err("Cannot register vgic CPU notifier\n"); + goto out_free_irq; + } + + /* Callback into for arch code for setup */ + vgic_arch_setup(vgic); + + on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); + + return kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); + +out_free_irq: + free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); + return ret; +} diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index bf06577fea51..e05000e200d2 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -302,7 +302,7 @@ static void kvm_free_assigned_device(struct kvm *kvm, else pci_restore_state(assigned_dev->dev); - assigned_dev->dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_clear_dev_assigned(assigned_dev->dev); pci_release_regions(assigned_dev->dev); pci_disable_device(assigned_dev->dev); @@ -526,8 +526,10 @@ static int assign_guest_irq(struct kvm *kvm, dev->irq_requested_type |= guest_irq_type; if (dev->ack_notifier.gsi != -1) kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else + } else { kvm_free_irq_source_id(kvm, dev->irq_source_id); + dev->irq_source_id = -1; + } return r; } diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index d6a3d0993d88..5ff7f7f2689a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,9 +80,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - down_read(&mm->mmap_sem); - get_user_pages(NULL, mm, addr, 1, 1, 0, NULL, NULL); - up_read(&mm->mmap_sem); + kvm_get_user_page_io(NULL, mm, addr, 1, NULL); kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 20c3af7692c5..b0fb390943c6 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -33,10 +33,15 @@ #include <linux/kernel.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/seqlock.h> 
+#include <trace/events/kvm.h> +#ifdef __KVM_HAVE_IOAPIC +#include "ioapic.h" +#endif #include "iodev.h" -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +#ifdef CONFIG_HAVE_KVM_IRQFD /* * -------------------------------------------------------------------- * irqfd: Allows an fd to be used to inject an interrupt to the guest @@ -75,7 +80,8 @@ struct _irqfd { struct kvm *kvm; wait_queue_t wait; /* Update side is protected by irqfds.lock */ - struct kvm_kernel_irq_routing_entry __rcu *irq_entry; + struct kvm_kernel_irq_routing_entry irq_entry; + seqcount_t irq_entry_sc; /* Used for level IRQ fast-path */ int gsi; struct work_struct inject; @@ -223,16 +229,20 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); unsigned long flags = (unsigned long)key; - struct kvm_kernel_irq_routing_entry *irq; + struct kvm_kernel_irq_routing_entry irq; struct kvm *kvm = irqfd->kvm; + unsigned seq; int idx; if (flags & POLLIN) { idx = srcu_read_lock(&kvm->irq_srcu); - irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu); + do { + seq = read_seqcount_begin(&irqfd->irq_entry_sc); + irq = irqfd->irq_entry; + } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ - if (irq) - kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, + if (irq.type == KVM_IRQ_ROUTING_MSI) + kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); else schedule_work(&irqfd->inject); @@ -272,34 +282,37 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, } /* Must be called under irqfds.lock */ -static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, - struct kvm_irq_routing_table *irq_rt) +static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + int i, n_entries; - if (irqfd->gsi >= irq_rt->nr_rt_entries) { - 
rcu_assign_pointer(irqfd->irq_entry, NULL); - return; - } + n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); + + write_seqcount_begin(&irqfd->irq_entry_sc); - hlist_for_each_entry(e, &irq_rt->map[irqfd->gsi], link) { + irqfd->irq_entry.type = 0; + + e = entries; + for (i = 0; i < n_entries; ++i, ++e) { /* Only fast-path MSI. */ if (e->type == KVM_IRQ_ROUTING_MSI) - rcu_assign_pointer(irqfd->irq_entry, e); - else - rcu_assign_pointer(irqfd->irq_entry, NULL); + irqfd->irq_entry = *e; } + + write_seqcount_end(&irqfd->irq_entry_sc); } static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { - struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct fd f; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; unsigned int events; + int idx; irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); if (!irqfd) @@ -310,6 +323,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + seqcount_init(&irqfd->irq_entry_sc); f = fdget(args->fd); if (!f.file) { @@ -392,9 +406,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) goto fail; } - irq_rt = rcu_dereference_protected(kvm->irq_routing, - lockdep_is_held(&kvm->irqfds.lock)); - irqfd_update(kvm, irqfd, irq_rt); + idx = srcu_read_lock(&kvm->irq_srcu); + irqfd_update(kvm, irqfd); + srcu_read_unlock(&kvm->irq_srcu, idx); list_add_tail(&irqfd->list, &kvm->irqfds.items); @@ -433,12 +447,73 @@ out: kfree(irqfd); return ret; } + +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + int gsi, idx; + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) { + srcu_read_unlock(&kvm->irq_srcu, idx); + return true; + } + + srcu_read_unlock(&kvm->irq_srcu, idx); + + 
return false; +} +EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + int gsi, idx; + + trace_kvm_ack_irq(irqchip, pin); + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); + if (gsi != -1) + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); +#ifdef __KVM_HAVE_IOAPIC + kvm_vcpu_request_scan_ioapic(kvm); +#endif +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_init_rcu(&kian->link); + mutex_unlock(&kvm->irq_lock); + synchronize_srcu(&kvm->irq_srcu); +#ifdef __KVM_HAVE_IOAPIC + kvm_vcpu_request_scan_ioapic(kvm); +#endif +} #endif void kvm_eventfd_init(struct kvm *kvm) { -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +#ifdef CONFIG_HAVE_KVM_IRQFD spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); INIT_LIST_HEAD(&kvm->irqfds.resampler_list); @@ -447,7 +522,7 @@ kvm_eventfd_init(struct kvm *kvm) INIT_LIST_HEAD(&kvm->ioeventfds); } -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +#ifdef CONFIG_HAVE_KVM_IRQFD /* * shutdown any irqfd's that match fd+gsi */ @@ -466,14 +541,14 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { /* - * This rcu_assign_pointer is needed for when + * This clearing of irq_entry.type is needed for when * another thread calls kvm_irq_routing_update before * we flush workqueue below (we synchronize with * kvm_irq_routing_update using irqfds.lock). 
- * It is paired with synchronize_srcu done by caller - * of that function. */ - rcu_assign_pointer(irqfd->irq_entry, NULL); + write_seqcount_begin(&irqfd->irq_entry_sc); + irqfd->irq_entry.type = 0; + write_seqcount_end(&irqfd->irq_entry_sc); irqfd_deactivate(irqfd); } } @@ -528,20 +603,17 @@ kvm_irqfd_release(struct kvm *kvm) } /* - * Change irq_routing and irqfd. + * Take note of a change in irq routing. * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. */ -void kvm_irq_routing_update(struct kvm *kvm, - struct kvm_irq_routing_table *irq_rt) +void kvm_irq_routing_update(struct kvm *kvm) { struct _irqfd *irqfd; spin_lock_irq(&kvm->irqfds.lock); - rcu_assign_pointer(kvm->irq_routing, irq_rt); - list_for_each_entry(irqfd, &kvm->irqfds.items, list) - irqfd_update(kvm, irqfd, irq_rt); + irqfd_update(kvm, irqfd); spin_unlock_irq(&kvm->irqfds.lock); } diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index e8ce34c9db32..0ba4057d271b 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -405,6 +405,26 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) spin_unlock(&ioapic->lock); } +static void kvm_ioapic_eoi_inject_work(struct work_struct *work) +{ + int i; + struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic, + eoi_inject.work); + spin_lock(&ioapic->lock); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG) + continue; + + if (ioapic->irr & (1 << i) && !ent->fields.remote_irr) + ioapic_service(ioapic, i, false); + } + spin_unlock(&ioapic->lock); +} + +#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 + static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { @@ -435,8 +455,26 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (ioapic->irr & (1 << i)) - 
ioapic_service(ioapic, i, false); + if (!ent->fields.mask && (ioapic->irr & (1 << i))) { + ++ioapic->irq_eoi[i]; + if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. + */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[i] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); + } else { + ioapic_service(ioapic, i, false); + } + } else { + ioapic->irq_eoi[i] = 0; + } } } @@ -565,12 +603,14 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) { int i; + cancel_delayed_work_sync(&ioapic->eoi_inject); for (i = 0; i < IOAPIC_NUM_PINS; i++) ioapic->redirtbl[i].fields.mask = 1; ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); update_handled_vectors(ioapic); } @@ -589,6 +629,7 @@ int kvm_ioapic_init(struct kvm *kvm) if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); + INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); @@ -609,6 +650,7 @@ void kvm_ioapic_destroy(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; + cancel_delayed_work_sync(&ioapic->eoi_inject); if (ioapic) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); kvm->arch.vioapic = NULL; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 90d43e95dcf8..e23b70634f1e 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -59,6 +59,8 @@ struct kvm_ioapic { spinlock_t lock; DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; + struct delayed_work eoi_inject; + u32 irq_eoi[IOAPIC_NUM_PINS]; 
}; #ifdef DEBUG diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 0df7d4b34dfe..c1e6ae989a43 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -43,13 +43,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long size) + unsigned long npages) { gfn_t end_gfn; pfn_t pfn; pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + (size >> PAGE_SHIFT); + end_gfn = gfn + npages; gfn += 1; if (is_error_noslot_pfn(pfn)) @@ -61,6 +61,14 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, return pfn; } +static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; ++i) + kvm_release_pfn_clean(pfn + i); +} + int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { gfn_t gfn, end_gfn; @@ -111,7 +119,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. 
*/ - pfn = kvm_pin_pages(slot, gfn, page_size); + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); if (is_error_noslot_pfn(pfn)) { gfn += 1; continue; @@ -123,6 +131,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%llx\n", pfn); + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); goto unmap_pages; } @@ -134,7 +143,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) return 0; unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn); + kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); return r; } @@ -182,8 +191,7 @@ int kvm_assign_device(struct kvm *kvm, return r; } - noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain, - IOMMU_CAP_CACHE_COHERENCY); + noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); /* Check if need to update IOMMU page table for guest memory */ if (noncoherent != kvm->arch.iommu_noncoherent) { @@ -194,7 +202,7 @@ int kvm_assign_device(struct kvm *kvm, goto out_unmap; } - pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; + pci_set_dev_assigned(pdev); dev_info(&pdev->dev, "kvm assign device\n"); @@ -220,7 +228,7 @@ int kvm_deassign_device(struct kvm *kvm, iommu_detach_device(domain, &pdev->dev); - pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; + pci_clear_dev_assigned(pdev); dev_info(&pdev->dev, "kvm deassign device\n"); @@ -245,8 +253,7 @@ int kvm_iommu_map_guest(struct kvm *kvm) } if (!allow_unsafe_assigned_interrupts && - !iommu_domain_has_cap(kvm->arch.iommu_domain, - IOMMU_CAP_INTR_REMAP)) { + !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { printk(KERN_WARNING "%s: No interrupt remapping support," " disallowing device assignment." 
" Re-enble with \"allow_unsafe_assigned_interrupts=1\"" @@ -266,14 +273,6 @@ out_unlock: return r; } -static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages) { diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a228ee82bad2..963b8995a9e8 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -160,9 +160,9 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, */ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) { + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; struct kvm_kernel_irq_routing_entry *e; int ret = -EINVAL; - struct kvm_irq_routing_table *irq_rt; int idx; trace_kvm_set_irq(irq, level, irq_source_id); @@ -176,15 +176,13 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) * which is limited to 1:1 GSI mapping. 
*/ idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (irq < irq_rt->nr_rt_entries) - hlist_for_each_entry(e, &irq_rt->map[irq], link) { - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; - break; - } + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + if (likely(e->type == KVM_IRQ_ROUTING_MSI)) + ret = kvm_set_msi_inatomic(e, kvm); + else + ret = -EWOULDBLOCK; + } srcu_read_unlock(&kvm->irq_srcu, idx); return ret; } @@ -264,7 +262,7 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, int idx, gsi; idx = srcu_read_lock(&kvm->irq_srcu); - gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link) if (kimn->irq == gsi) @@ -272,8 +270,7 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } -int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, - struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; @@ -304,7 +301,6 @@ int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, e->irqchip.pin = ue->u.irqchip.pin + delta; if (e->irqchip.pin >= max_pin) goto out; - rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index b43c275775cd..7f256f31df10 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,65 +31,42 @@ #include <trace/events/kvm.h> #include "irq.h" -bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - struct kvm_irq_ack_notifier *kian; - int gsi, idx; - - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = 
srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; - if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) { - srcu_read_unlock(&kvm->irq_srcu, idx); - return true; - } - - srcu_read_unlock(&kvm->irq_srcu, idx); - - return false; -} -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + struct kvm_kernel_irq_routing_entry *rt_entries; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +int kvm_irq_map_gsi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *entries, int gsi) { - struct kvm_irq_ack_notifier *kian; - int gsi, idx; - - trace_kvm_ack_irq(irqchip, pin); + struct kvm_irq_routing_table *irq_rt; + struct kvm_kernel_irq_routing_entry *e; + int n = 0; + + irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, + lockdep_is_held(&kvm->irq_lock)); + if (gsi < irq_rt->nr_rt_entries) { + hlist_for_each_entry(e, &irq_rt->map[gsi], link) { + entries[n] = *e; + ++n; + } + } - idx = srcu_read_lock(&kvm->irq_srcu); - gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin]; - if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) - kian->irq_acked(kian); - srcu_read_unlock(&kvm->irq_srcu, idx); + return n; } -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { - mutex_lock(&kvm->irq_lock); - hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); - mutex_unlock(&kvm->irq_lock); -#ifdef __KVM_HAVE_IOAPIC - kvm_vcpu_request_scan_ioapic(kvm); -#endif -} + struct kvm_irq_routing_table *irq_rt; -void kvm_unregister_irq_ack_notifier(struct kvm 
*kvm, - struct kvm_irq_ack_notifier *kian) -{ - mutex_lock(&kvm->irq_lock); - hlist_del_init_rcu(&kian->link); - mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); -#ifdef __KVM_HAVE_IOAPIC - kvm_vcpu_request_scan_ioapic(kvm); -#endif + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + return irq_rt->chip[irqchip][pin]; } int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) @@ -115,9 +92,8 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { - struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; - int ret = -1, i = 0, idx; - struct kvm_irq_routing_table *irq_rt; + struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; + int ret = -1, i, idx; trace_kvm_set_irq(irq, level, irq_source_id); @@ -126,10 +102,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, * writes to the unused one. */ idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (irq < irq_rt->nr_rt_entries) - hlist_for_each_entry(e, &irq_rt->map[irq], link) - irq_set[i++] = *e; + i = kvm_irq_map_gsi(kvm, irq_set, irq); srcu_read_unlock(&kvm->irq_srcu, idx); while(i--) { @@ -171,9 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, e->gsi = ue->gsi; e->type = ue->type; - r = kvm_set_routing_entry(rt, e, ue); + r = kvm_set_routing_entry(e, ue); if (r) goto out; + if (e->type == KVM_IRQ_ROUTING_IRQCHIP) + rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); r = 0; @@ -224,7 +199,8 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(&kvm->irq_lock); old = kvm->irq_routing; - kvm_irq_routing_update(kvm, new); + rcu_assign_pointer(kvm->irq_routing, new); + kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); synchronize_srcu_expedited(&kvm->irq_srcu); diff --git a/virt/kvm/kvm_main.c 
b/virt/kvm/kvm_main.c index 4b6c01b477f9..25ffac9e947d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -52,11 +52,13 @@ #include <asm/processor.h> #include <asm/io.h> +#include <asm/ioctl.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include "coalesced_mmio.h" #include "async_pf.h" +#include "vfio.h" #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -95,8 +97,6 @@ static int hardware_enable_all(void); static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, u64 last_generation); static void kvm_release_pfn_dirty(pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm *kvm, @@ -110,7 +110,7 @@ static bool largepages_enabled = true; bool kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); return true; } @@ -129,7 +129,8 @@ int vcpu_load(struct kvm_vcpu *vcpu) struct pid *oldpid = vcpu->pid; struct pid *newpid = get_task_pid(current, PIDTYPE_PID); rcu_assign_pointer(vcpu->pid, newpid); - synchronize_rcu(); + if (oldpid) + synchronize_rcu(); put_pid(oldpid); } cpu = get_cpu(); @@ -152,7 +153,7 @@ static void ack_flush(void *_completed) { } -static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; @@ -189,7 +190,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) long dirty_count = kvm->tlbs_dirty; smp_mb(); - if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) + if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } @@ -197,17 +198,17 @@ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_reload_remote_mmus(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); + kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 
} void kvm_make_mclock_inprogress_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } void kvm_make_scan_ioapic_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) @@ -295,6 +296,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + kvm_arch_mmu_notifier_invalidate_page(kvm, address); + srcu_read_unlock(&kvm->srcu, idx); } @@ -368,7 +372,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -376,7 +381,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); + young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); @@ -465,6 +470,8 @@ static struct kvm *kvm_create_vm(unsigned long type) #ifdef CONFIG_HAVE_KVM_IRQCHIP INIT_HLIST_HEAD(&kvm->mask_notifier_list); +#endif +#ifdef CONFIG_HAVE_KVM_IRQFD INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif @@ -474,6 +481,13 @@ static struct kvm *kvm_create_vm(unsigned long type) kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!kvm->memslots) goto out_err_no_srcu; + + /* + * Init kvm generation close to the maximum to easily test the + * code of handling generation number wrap-around. 
+ */ + kvm->memslots->generation = -150; + kvm_init_memslots_id(kvm); if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; @@ -685,8 +699,7 @@ static void sort_memslots(struct kvm_memslots *slots) } static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *new, - u64 last_generation) + struct kvm_memory_slot *new) { if (new) { int id = new->id; @@ -697,15 +710,13 @@ static void update_memslots(struct kvm_memslots *slots, if (new->npages != npages) sort_memslots(slots); } - - slots->generation = last_generation + 1; } static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef KVM_CAP_READONLY_MEM +#ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif @@ -720,10 +731,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, { struct kvm_memslots *old_memslots = kvm->memslots; - update_memslots(slots, new, kvm->memslots->generation); + /* + * Set the low bit in the generation, which disables SPTE caching + * until the end of synchronize_srcu_expedited. + */ + WARN_ON(old_memslots->generation & 1); + slots->generation = old_memslots->generation + 1; + + update_memslots(slots, new); rcu_assign_pointer(kvm->memslots, slots); synchronize_srcu_expedited(&kvm->srcu); + /* + * Increment the new memslot generation a second time. This prevents + * vm exits that race with memslot updates from caching a memslot + * generation that will (potentially) be valid forever. 
+ */ + slots->generation++; + kvm_arch_memslots_updated(kvm); return old_memslots; @@ -774,7 +799,6 @@ int __kvm_set_memory_region(struct kvm *kvm, base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; npages = mem->memory_size >> PAGE_SHIFT; - r = -EINVAL; if (npages > KVM_MEM_MAX_NR_PAGES) goto out; @@ -788,7 +812,6 @@ int __kvm_set_memory_region(struct kvm *kvm, new.npages = npages; new.flags = mem->flags; - r = -EINVAL; if (npages) { if (!old.npages) change = KVM_MR_CREATE; @@ -844,7 +867,6 @@ int __kvm_set_memory_region(struct kvm *kvm, } if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { - r = -ENOMEM; slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) @@ -1073,9 +1095,9 @@ EXPORT_SYMBOL_GPL(gfn_to_hva); * If writable is set to false, the hva returned by this function is only * allowed to be read. */ -unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, + gfn_t gfn, bool *writable) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); if (!kvm_is_error_hva(hva) && writable) @@ -1084,6 +1106,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) return hva; } +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + + return gfn_to_hva_memslot_prot(slot, gfn, writable); +} + static int kvm_read_hva(void *data, void __user *hva, int len) { return __copy_from_user(data, hva, len); @@ -1105,6 +1134,43 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, bool write_fault, + struct page **pagep) +{ + int npages; + int locked = 1; + int flags = FOLL_TOUCH | FOLL_HWPOISON | + 
(pagep ? FOLL_GET : 0) | + (write_fault ? FOLL_WRITE : 0); + + /* + * If retrying the fault, we get here *not* having allowed the filemap + * to wait on the page lock. We should now allow waiting on the IO with + * the mmap semaphore released. + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, + &locked); + if (!locked) { + VM_BUG_ON(npages); + + if (!pagep) + return 0; + + /* + * The previous call has now waited on the IO. Now we can + * retry and complete. Pass TRIED to ensure we do not re + * schedule async IO (see e.g. filemap_fault). + */ + down_read(&mm->mmap_sem); + npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, + pagep, NULL, NULL); + } + up_read(&mm->mmap_sem); + return npages; +} + static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1167,9 +1233,15 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(&current->mm->mmap_sem); - } else - npages = get_user_pages_fast(addr, 1, write_fault, - page); + } else { + /* + * By now we have tried gup_fast, and possibly async_pf, and we + * are certainly not atomic. Time to retry the gup, allowing + * mmap semaphore to be relinquished in the case of IO.
+ */ + npages = kvm_get_user_page_io(current, current->mm, addr, + write_fault, page); + } if (npages != 1) return npages; @@ -1723,7 +1795,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target) rcu_read_lock(); pid = rcu_dereference(target->pid); if (pid) - task = get_pid_task(target->pid, PIDTYPE_PID); + task = get_pid_task(pid, PIDTYPE_PID); rcu_read_unlock(); if (!task) return ret; @@ -1766,8 +1838,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) bool eligible; eligible = !vcpu->spin_loop.in_spin_loop || - (vcpu->spin_loop.in_spin_loop && - vcpu->spin_loop.dy_eligible); + vcpu->spin_loop.dy_eligible; if (vcpu->spin_loop.in_spin_loop) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); @@ -1973,6 +2044,9 @@ static long kvm_vcpu_ioctl(struct file *filp, if (vcpu->kvm->mm != current->mm) return -EIO; + if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) + return -EINVAL; + #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) /* * Special cases: vcpu ioctls that are asynchronous to vcpu execution, @@ -2257,6 +2331,35 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } +static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { +#ifdef CONFIG_KVM_MPIC + [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, + [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, +#endif + +#ifdef CONFIG_KVM_XICS + [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, +#endif +}; + +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +{ + if (type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENOSPC; + + if (kvm_device_ops_table[type] != NULL) + return -EEXIST; + + kvm_device_ops_table[type] = ops; + return 0; +} + +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -2265,36 +2368,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm, bool test = 
cd->flags & KVM_CREATE_DEVICE_TEST; int ret; - switch (cd->type) { -#ifdef CONFIG_KVM_MPIC - case KVM_DEV_TYPE_FSL_MPIC_20: - case KVM_DEV_TYPE_FSL_MPIC_42: - ops = &kvm_mpic_ops; - break; -#endif -#ifdef CONFIG_KVM_XICS - case KVM_DEV_TYPE_XICS: - ops = &kvm_xics_ops; - break; -#endif -#ifdef CONFIG_KVM_VFIO - case KVM_DEV_TYPE_VFIO: - ops = &kvm_vfio_ops; - break; -#endif -#ifdef CONFIG_KVM_ARM_VGIC - case KVM_DEV_TYPE_ARM_VGIC_V2: - ops = &kvm_arm_vgic_v2_ops; - break; -#endif -#ifdef CONFIG_S390 - case KVM_DEV_TYPE_FLIC: - ops = &kvm_flic_ops; - break; -#endif - default: + if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENODEV; + + ops = kvm_device_ops_table[cd->type]; + if (ops == NULL) return -ENODEV; - } if (test) return 0; @@ -2324,6 +2403,34 @@ static int kvm_ioctl_create_device(struct kvm *kvm, return 0; } +static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) +{ + switch (arg) { + case KVM_CAP_USER_MEMORY: + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: + case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQFD + case KVM_CAP_IRQFD_RESAMPLE: +#endif + case KVM_CAP_CHECK_EXTENSION_VM: + return 1; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_CAP_IRQ_ROUTING: + return KVM_MAX_IRQ_ROUTES; +#endif + default: + break; + } + return kvm_vm_ioctl_check_extension(kvm, arg); +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2487,6 +2594,9 @@ static long kvm_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_CHECK_EXTENSION: + r = kvm_vm_ioctl_check_extension_generic(kvm, arg); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -2571,33 +2681,6 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) return r; } -static long kvm_dev_ioctl_check_extension_generic(long arg) 
-{ - switch (arg) { - case KVM_CAP_USER_MEMORY: - case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: - case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: -#ifdef CONFIG_KVM_APIC_ARCHITECTURE - case KVM_CAP_SET_BOOT_CPU_ID: -#endif - case KVM_CAP_INTERNAL_ERROR_DATA: -#ifdef CONFIG_HAVE_KVM_MSI - case KVM_CAP_SIGNAL_MSI: -#endif -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQFD_RESAMPLE: -#endif - return 1; -#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING - case KVM_CAP_IRQ_ROUTING: - return KVM_MAX_IRQ_ROUTES; -#endif - default: - break; - } - return kvm_dev_ioctl_check_extension(arg); -} - static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2605,7 +2688,6 @@ static long kvm_dev_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_API_VERSION: - r = -EINVAL; if (arg) goto out; r = KVM_API_VERSION; @@ -2614,10 +2696,9 @@ static long kvm_dev_ioctl(struct file *filp, r = kvm_dev_ioctl_create_vm(arg); break; case KVM_CHECK_EXTENSION: - r = kvm_dev_ioctl_check_extension_generic(arg); + r = kvm_vm_ioctl_check_extension_generic(NULL, arg); break; case KVM_GET_VCPU_MMAP_SIZE: - r = -EINVAL; if (arg) goto out; r = PAGE_SIZE; /* struct kvm_run */ @@ -2662,7 +2743,7 @@ static void hardware_enable_nolock(void *junk) cpumask_set_cpu(cpu, cpus_hardware_enabled); - r = kvm_arch_hardware_enable(NULL); + r = kvm_arch_hardware_enable(); if (r) { cpumask_clear_cpu(cpu, cpus_hardware_enabled); @@ -2687,7 +2768,7 @@ static void hardware_disable_nolock(void *junk) if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; cpumask_clear_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_disable(NULL); + kvm_arch_hardware_disable(); } static void hardware_disable(void) @@ -3117,6 +3198,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) if (vcpu->preempted) vcpu->preempted = false; + kvm_arch_sched_in(vcpu, cpu); + kvm_arch_vcpu_load(vcpu, cpu); } @@ -3208,6 +3291,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, goto out_undebugfs; } 
+ r = kvm_vfio_ops_init(); + WARN_ON(r); + return 0; out_undebugfs: @@ -3248,5 +3334,6 @@ void kvm_exit(void) kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ba1a93f935c7..620e37f741b8 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include "vfio.h" struct kvm_vfio_group { struct list_head node; @@ -246,6 +247,16 @@ static void kvm_vfio_destroy(struct kvm_device *dev) kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ } +static int kvm_vfio_create(struct kvm_device *dev, u32 type); + +static struct kvm_device_ops kvm_vfio_ops = { + .name = "kvm-vfio", + .create = kvm_vfio_create, + .destroy = kvm_vfio_destroy, + .set_attr = kvm_vfio_set_attr, + .has_attr = kvm_vfio_has_attr, +}; + static int kvm_vfio_create(struct kvm_device *dev, u32 type) { struct kvm_device *tmp; @@ -268,10 +279,12 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) return 0; } -struct kvm_device_ops kvm_vfio_ops = { - .name = "kvm-vfio", - .create = kvm_vfio_create, - .destroy = kvm_vfio_destroy, - .set_attr = kvm_vfio_set_attr, - .has_attr = kvm_vfio_has_attr, -}; +int kvm_vfio_ops_init(void) +{ + return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); +} + +void kvm_vfio_ops_exit(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); +} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h new file mode 100644 index 000000000000..ab88c7dc0514 --- /dev/null +++ b/virt/kvm/vfio.h @@ -0,0 +1,17 @@ +#ifndef __KVM_VFIO_H +#define __KVM_VFIO_H + +#ifdef CONFIG_KVM_VFIO +int kvm_vfio_ops_init(void); +void kvm_vfio_ops_exit(void); +#else +static inline int kvm_vfio_ops_init(void) +{ + return 0; +} +static inline void kvm_vfio_ops_exit(void) +{ +} +#endif + +#endif |