41 files changed, 3506 insertions, 4421 deletions
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index a88726359e5f..5e3c673fa3f4 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -196,13 +196,17 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS	8
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT	10
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
 			(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
 #define VGIC_LEVEL_INFO_LINE_LEVEL	0
 
-#define   KVM_DEV_ARM_VGIC_CTRL_INIT    0
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
+#define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
+#define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
+#define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 7b3670c2ae7b..d9beee652d36 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -18,9 +18,12 @@ KVM := ../../../virt/kvm
 kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
 
 obj-$(CONFIG_KVM_ARM_HOST) += hyp/
+
 obj-y += kvm-arm.o init.o interrupts.o
-obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
-obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o vgic-v3-coproc.o
+obj-y += handle_exit.o guest.o emulate.o reset.o
+obj-y += coproc.o coproc_a15.o coproc_a7.o   vgic-v3-coproc.o
+obj-y += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
+obj-y += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
 obj-y += $(KVM)/arm/aarch32.o
 
 obj-y += $(KVM)/arm/vgic/vgic.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
deleted file mode 100644
index 8a31906bdc9b..000000000000
--- a/arch/arm/kvm/arm.c
+++ /dev/null
@@ -1,1480 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/cpu_pm.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/kvm_host.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/kvm.h>
-#include <trace/events/kvm.h>
-#include <kvm/arm_pmu.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-#include <linux/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/mman.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/virt.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
-#include <asm/kvm_psci.h>
-#include <asm/sections.h>
-
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension	virt");
-#endif
-
-static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
-
-/* Per-CPU variable containing the currently running vcpu. */
-static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
-
-/* The VMID used in the VTTBR */
-static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u32 kvm_next_vmid;
-static unsigned int kvm_vmid_bits __read_mostly;
-static DEFINE_SPINLOCK(kvm_vmid_lock);
-
-static bool vgic_present;
-
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
-
-static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
-{
-	BUG_ON(preemptible());
-	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
-}
-
-/**
- * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU.
- * Must be called from non-preemptible context
- */
-struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
-{
-	BUG_ON(preemptible());
-	return __this_cpu_read(kvm_arm_running_vcpu);
-}
-
-/**
- * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus.
- */
-struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
-{
-	return &kvm_arm_running_vcpu;
-}
-
-int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
-{
-	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
-}
-
-int kvm_arch_hardware_setup(void)
-{
-	return 0;
-}
-
-void kvm_arch_check_processor_compat(void *rtn)
-{
-	*(int *)rtn = 0;
-}
-
-
-/**
- * kvm_arch_init_vm - initializes a VM data structure
- * @kvm:	pointer to the KVM struct
- */
-int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-{
-	int ret, cpu;
-
-	if (type)
-		return -EINVAL;
-
-	kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
-	if (!kvm->arch.last_vcpu_ran)
-		return -ENOMEM;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
-
-	ret = kvm_alloc_stage2_pgd(kvm);
-	if (ret)
-		goto out_fail_alloc;
-
-	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
-	if (ret)
-		goto out_free_stage2_pgd;
-
-	kvm_vgic_early_init(kvm);
-
-	/* Mark the initial VMID generation invalid */
-	kvm->arch.vmid_gen = 0;
-
-	/* The maximum number of VCPUs is limited by the host's GIC model */
-	kvm->arch.max_vcpus = vgic_present ?
-				kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
-
-	return ret;
-out_free_stage2_pgd:
-	kvm_free_stage2_pgd(kvm);
-out_fail_alloc:
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-	return ret;
-}
-
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
-{
-	return VM_FAULT_SIGBUS;
-}
-
-
-/**
- * kvm_arch_destroy_vm - destroy the VM data structure
- * @kvm:	pointer to the KVM struct
- */
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-	int i;
-
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_arch_vcpu_free(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
-
-	kvm_vgic_destroy(kvm);
-}
-
-int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-{
-	int r;
-	switch (ext) {
-	case KVM_CAP_IRQCHIP:
-		r = vgic_present;
-		break;
-	case KVM_CAP_IOEVENTFD:
-	case KVM_CAP_DEVICE_CTRL:
-	case KVM_CAP_USER_MEMORY:
-	case KVM_CAP_SYNC_MMU:
-	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
-	case KVM_CAP_ONE_REG:
-	case KVM_CAP_ARM_PSCI:
-	case KVM_CAP_ARM_PSCI_0_2:
-	case KVM_CAP_READONLY_MEM:
-	case KVM_CAP_MP_STATE:
-	case KVM_CAP_IMMEDIATE_EXIT:
-		r = 1;
-		break;
-	case KVM_CAP_ARM_SET_DEVICE_ADDR:
-		r = 1;
-		break;
-	case KVM_CAP_NR_VCPUS:
-		r = num_online_cpus();
-		break;
-	case KVM_CAP_MAX_VCPUS:
-		r = KVM_MAX_VCPUS;
-		break;
-	case KVM_CAP_NR_MEMSLOTS:
-		r = KVM_USER_MEM_SLOTS;
-		break;
-	case KVM_CAP_MSI_DEVID:
-		if (!kvm)
-			r = -EINVAL;
-		else
-			r = kvm->arch.vgic.msis_require_devid;
-		break;
-	case KVM_CAP_ARM_USER_IRQ:
-		/*
-		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
-		 * (bump this number if adding more devices)
-		 */
-		r = 1;
-		break;
-	default:
-		r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
-		break;
-	}
-	return r;
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
-			unsigned int ioctl, unsigned long arg)
-{
-	return -EINVAL;
-}
-
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-	int err;
-	struct kvm_vcpu *vcpu;
-
-	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	if (id >= kvm->arch.max_vcpus) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	if (!vcpu) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = kvm_vcpu_init(vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
-
-	err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
-	if (err)
-		goto vcpu_uninit;
-
-	return vcpu;
-vcpu_uninit:
-	kvm_vcpu_uninit(vcpu);
-free_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
-out:
-	return ERR_PTR(err);
-}
-
-void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
-	kvm_vgic_vcpu_early_init(vcpu);
-}
-
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
-{
-	kvm_mmu_free_memory_caches(vcpu);
-	kvm_timer_vcpu_terminate(vcpu);
-	kvm_vgic_vcpu_destroy(vcpu);
-	kvm_pmu_vcpu_destroy(vcpu);
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	kvm_arch_vcpu_free(vcpu);
-}
-
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
-	       kvm_timer_should_fire(vcpu_ptimer(vcpu));
-}
-
-void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
-	kvm_timer_schedule(vcpu);
-}
-
-void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-	kvm_timer_unschedule(vcpu);
-}
-
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	/* Force users to call KVM_ARM_VCPU_INIT */
-	vcpu->arch.target = -1;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* Set up the timer */
-	kvm_timer_vcpu_init(vcpu);
-
-	kvm_arm_reset_debug_ptr(vcpu);
-
-	return 0;
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	int *last_ran;
-
-	last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
-
-	/*
-	 * We might get preempted before the vCPU actually runs, but
-	 * over-invalidation doesn't affect correctness.
-	 */
-	if (*last_ran != vcpu->vcpu_id) {
-		kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
-		*last_ran = vcpu->vcpu_id;
-	}
-
-	vcpu->cpu = cpu;
-	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
-
-	kvm_arm_set_running_vcpu(vcpu);
-
-	kvm_vgic_load(vcpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	kvm_vgic_put(vcpu);
-
-	vcpu->cpu = -1;
-
-	kvm_arm_set_running_vcpu(NULL);
-	kvm_timer_vcpu_put(vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	if (vcpu->arch.power_off)
-		mp_state->mp_state = KVM_MP_STATE_STOPPED;
-	else
-		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
-
-	return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	switch (mp_state->mp_state) {
-	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.power_off = false;
-		break;
-	case KVM_MP_STATE_STOPPED:
-		vcpu->arch.power_off = true;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/**
- * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
- * @v:		The VCPU pointer
- *
- * If the guest CPU is not waiting for interrupts or an interrupt line is
- * asserted, the CPU is by definition runnable.
- */
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
-{
-	return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
-		&& !v->arch.power_off && !v->arch.pause);
-}
-
-/* Just ensure a guest exit from a particular CPU */
-static void exit_vm_noop(void *info)
-{
-}
-
-void force_vm_exit(const cpumask_t *mask)
-{
-	preempt_disable();
-	smp_call_function_many(mask, exit_vm_noop, NULL, true);
-	preempt_enable();
-}
-
-/**
- * need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to check
- *
- * return true if there is a new generation of VMIDs being used
- *
- * The hardware supports only 256 values with the value zero reserved for the
- * host, so we check if an assigned value belongs to a previous generation,
- * which which requires us to assign a new value. If we're the first to use a
- * VMID for the new generation, we must flush necessary caches and TLBs on all
- * CPUs.
- */
-static bool need_new_vmid_gen(struct kvm *kvm)
-{
-	return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
-}
-
-/**
- * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
- * @kvm	The guest that we are about to run
- *
- * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
- * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
- * caches and TLBs.
- */
-static void update_vttbr(struct kvm *kvm)
-{
-	phys_addr_t pgd_phys;
-	u64 vmid;
-
-	if (!need_new_vmid_gen(kvm))
-		return;
-
-	spin_lock(&kvm_vmid_lock);
-
-	/*
-	 * We need to re-check the vmid_gen here to ensure that if another vcpu
-	 * already allocated a valid vmid for this vm, then this vcpu should
-	 * use the same vmid.
-	 */
-	if (!need_new_vmid_gen(kvm)) {
-		spin_unlock(&kvm_vmid_lock);
-		return;
-	}
-
-	/* First user of a new VMID generation? */
-	if (unlikely(kvm_next_vmid == 0)) {
-		atomic64_inc(&kvm_vmid_gen);
-		kvm_next_vmid = 1;
-
-		/*
-		 * On SMP we know no other CPUs can use this CPU's or each
-		 * other's VMID after force_vm_exit returns since the
-		 * kvm_vmid_lock blocks them from reentry to the guest.
-		 */
-		force_vm_exit(cpu_all_mask);
-		/*
-		 * Now broadcast TLB + ICACHE invalidation over the inner
-		 * shareable domain to make sure all data structures are
-		 * clean.
-		 */
-		kvm_call_hyp(__kvm_flush_vm_context);
-	}
-
-	kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
-	kvm->arch.vmid = kvm_next_vmid;
-	kvm_next_vmid++;
-	kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
-
-	/* update vttbr to be used with the new vmid */
-	pgd_phys = virt_to_phys(kvm->arch.pgd);
-	BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
-	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
-	kvm->arch.vttbr = pgd_phys | vmid;
-
-	spin_unlock(&kvm_vmid_lock);
-}
-
-static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	int ret = 0;
-
-	if (likely(vcpu->arch.has_run_once))
-		return 0;
-
-	vcpu->arch.has_run_once = true;
-
-	/*
-	 * Map the VGIC hardware resources before running a vcpu the first
-	 * time on this VM.
-	 */
-	if (unlikely(irqchip_in_kernel(kvm) && !vgic_ready(kvm))) {
-		ret = kvm_vgic_map_resources(kvm);
-		if (ret)
-			return ret;
-	}
-
-	ret = kvm_timer_enable(vcpu);
-
-	return ret;
-}
-
-bool kvm_arch_intc_initialized(struct kvm *kvm)
-{
-	return vgic_initialized(kvm);
-}
-
-void kvm_arm_halt_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.pause = true;
-	kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT);
-}
-
-void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.pause = true;
-	kvm_vcpu_kick(vcpu);
-}
-
-void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-	vcpu->arch.pause = false;
-	swake_up(wq);
-}
-
-void kvm_arm_resume_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_arm_resume_vcpu(vcpu);
-}
-
-static void vcpu_sleep(struct kvm_vcpu *vcpu)
-{
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-	swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
-				       (!vcpu->arch.pause)));
-}
-
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.target >= 0;
-}
-
-/**
- * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
- * @vcpu:	The VCPU pointer
- * @run:	The kvm_run structure pointer used for userspace state exchange
- *
- * This function is called through the VCPU_RUN ioctl called from user space. It
- * will execute VM code in a loop until the time slice for the process is used
- * or some emulation is needed from user space in which case the function will
- * return with return value 0 and with the kvm_run structure filled in with the
- * required data for the requested emulation.
- */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	int ret;
-	sigset_t sigsaved;
-
-	if (unlikely(!kvm_vcpu_initialized(vcpu)))
-		return -ENOEXEC;
-
-	ret = kvm_vcpu_first_run_init(vcpu);
-	if (ret)
-		return ret;
-
-	if (run->exit_reason == KVM_EXIT_MMIO) {
-		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
-		if (ret)
-			return ret;
-	}
-
-	if (run->immediate_exit)
-		return -EINTR;
-
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
-	ret = 1;
-	run->exit_reason = KVM_EXIT_UNKNOWN;
-	while (ret > 0) {
-		/*
-		 * Check conditions before entering the guest
-		 */
-		cond_resched();
-
-		update_vttbr(vcpu->kvm);
-
-		if (vcpu->arch.power_off || vcpu->arch.pause)
-			vcpu_sleep(vcpu);
-
-		/*
-		 * Preparing the interrupts to be injected also
-		 * involves poking the GIC, which must be done in a
-		 * non-preemptible context.
-		 */
-		preempt_disable();
-
-		kvm_pmu_flush_hwstate(vcpu);
-
-		kvm_timer_flush_hwstate(vcpu);
-		kvm_vgic_flush_hwstate(vcpu);
-
-		local_irq_disable();
-
-		/*
-		 * If we have a singal pending, or need to notify a userspace
-		 * irqchip about timer or PMU level changes, then we exit (and
-		 * update the timer level state in kvm_timer_update_run
-		 * below).
-		 */
-		if (signal_pending(current) ||
-		    kvm_timer_should_notify_user(vcpu) ||
-		    kvm_pmu_should_notify_user(vcpu)) {
-			ret = -EINTR;
-			run->exit_reason = KVM_EXIT_INTR;
-		}
-
-		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
-			vcpu->arch.power_off || vcpu->arch.pause) {
-			local_irq_enable();
-			kvm_pmu_sync_hwstate(vcpu);
-			kvm_timer_sync_hwstate(vcpu);
-			kvm_vgic_sync_hwstate(vcpu);
-			preempt_enable();
-			continue;
-		}
-
-		kvm_arm_setup_debug(vcpu);
-
-		/**************************************************************
-		 * Enter the guest
-		 */
-		trace_kvm_entry(*vcpu_pc(vcpu));
-		guest_enter_irqoff();
-		vcpu->mode = IN_GUEST_MODE;
-
-		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
-
-		vcpu->mode = OUTSIDE_GUEST_MODE;
-		vcpu->stat.exits++;
-		/*
-		 * Back from guest
-		 *************************************************************/
-
-		kvm_arm_clear_debug(vcpu);
-
-		/*
-		 * We may have taken a host interrupt in HYP mode (ie
-		 * while executing the guest). This interrupt is still
-		 * pending, as we haven't serviced it yet!
-		 *
-		 * We're now back in SVC mode, with interrupts
-		 * disabled.  Enabling the interrupts now will have
-		 * the effect of taking the interrupt again, in SVC
-		 * mode this time.
-		 */
-		local_irq_enable();
-
-		/*
-		 * We do local_irq_enable() before calling guest_exit() so
-		 * that if a timer interrupt hits while running the guest we
-		 * account that tick as being spent in the guest.  We enable
-		 * preemption after calling guest_exit() so that if we get
-		 * preempted we make sure ticks after that is not counted as
-		 * guest time.
-		 */
-		guest_exit();
-		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-
-		/*
-		 * We must sync the PMU and timer state before the vgic state so
-		 * that the vgic can properly sample the updated state of the
-		 * interrupt line.
-		 */
-		kvm_pmu_sync_hwstate(vcpu);
-		kvm_timer_sync_hwstate(vcpu);
-
-		kvm_vgic_sync_hwstate(vcpu);
-
-		preempt_enable();
-
-		ret = handle_exit(vcpu, run, ret);
-	}
-
-	/* Tell userspace about in-kernel device output levels */
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		kvm_timer_update_run(vcpu);
-		kvm_pmu_update_run(vcpu);
-	}
-
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	return ret;
-}
-
-static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
-{
-	int bit_index;
-	bool set;
-	unsigned long *ptr;
-
-	if (number == KVM_ARM_IRQ_CPU_IRQ)
-		bit_index = __ffs(HCR_VI);
-	else /* KVM_ARM_IRQ_CPU_FIQ */
-		bit_index = __ffs(HCR_VF);
-
-	ptr = (unsigned long *)&vcpu->arch.irq_lines;
-	if (level)
-		set = test_and_set_bit(bit_index, ptr);
-	else
-		set = test_and_clear_bit(bit_index, ptr);
-
-	/*
-	 * If we didn't change anything, no need to wake up or kick other CPUs
-	 */
-	if (set == level)
-		return 0;
-
-	/*
-	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
-	 * trigger a world-switch round on the running physical CPU to set the
-	 * virtual IRQ/FIQ fields in the HCR appropriately.
-	 */
-	kvm_vcpu_kick(vcpu);
-
-	return 0;
-}
-
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
-			  bool line_status)
-{
-	u32 irq = irq_level->irq;
-	unsigned int irq_type, vcpu_idx, irq_num;
-	int nrcpus = atomic_read(&kvm->online_vcpus);
-	struct kvm_vcpu *vcpu = NULL;
-	bool level = irq_level->level;
-
-	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
-	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
-	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
-
-	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
-
-	switch (irq_type) {
-	case KVM_ARM_IRQ_TYPE_CPU:
-		if (irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
-			return -EINVAL;
-
-		return vcpu_interrupt_line(vcpu, irq_num, level);
-	case KVM_ARM_IRQ_TYPE_PPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level);
-	case KVM_ARM_IRQ_TYPE_SPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (irq_num < VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
-	}
-
-	return -EINVAL;
-}
-
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			       const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-	int phys_target = kvm_target_cpu();
-
-	if (init->target != phys_target)
-		return -EINVAL;
-
-	/*
-	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-	 * use the same target.
-	 */
-	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
-		return -EINVAL;
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		bool set = (init->features[i / 32] & (1 << (i % 32)));
-
-		if (set && i >= KVM_VCPU_MAX_FEATURES)
-			return -ENOENT;
-
-		/*
-		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-		 * use the same feature set.
-		 */
-		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
-		    test_bit(i, vcpu->arch.features) != set)
-			return -EINVAL;
-
-		if (set)
-			set_bit(i, vcpu->arch.features);
-	}
-
-	vcpu->arch.target = phys_target;
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
-
-static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
-					 struct kvm_vcpu_init *init)
-{
-	int ret;
-
-	ret = kvm_vcpu_set_target(vcpu, init);
-	if (ret)
-		return ret;
-
-	/*
-	 * Ensure a rebooted VM will fault in RAM pages and detect if the
-	 * guest MMU is turned off and flush the caches as needed.
-	 */
-	if (vcpu->arch.has_run_once)
-		stage2_unmap_vm(vcpu->kvm);
-
-	vcpu_reset_hcr(vcpu);
-
-	/*
-	 * Handle the "start in power-off" case.
-	 */
-	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu->arch.power_off = true;
-	else
-		vcpu->arch.power_off = false;
-
-	return 0;
-}
-
-static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
-			 unsigned int ioctl, unsigned long arg)
-{
-	struct kvm_vcpu *vcpu = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	struct kvm_device_attr attr;
-
-	switch (ioctl) {
-	case KVM_ARM_VCPU_INIT: {
-		struct kvm_vcpu_init init;
-
-		if (copy_from_user(&init, argp, sizeof(init)))
-			return -EFAULT;
-
-		return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
-	}
-	case KVM_SET_ONE_REG:
-	case KVM_GET_ONE_REG: {
-		struct kvm_one_reg reg;
-
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			return -ENOEXEC;
-
-		if (copy_from_user(&reg, argp, sizeof(reg)))
-			return -EFAULT;
-		if (ioctl == KVM_SET_ONE_REG)
-			return kvm_arm_set_reg(vcpu, &reg);
-		else
-			return kvm_arm_get_reg(vcpu, &reg);
-	}
-	case KVM_GET_REG_LIST: {
-		struct kvm_reg_list __user *user_list = argp;
-		struct kvm_reg_list reg_list;
-		unsigned n;
-
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			return -ENOEXEC;
-
-		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-			return -EFAULT;
-		n = reg_list.n;
-		reg_list.n = kvm_arm_num_regs(vcpu);
-		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-			return -EFAULT;
-		if (n < reg_list.n)
-			return -E2BIG;
-		return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
-	}
-	case KVM_SET_DEVICE_ATTR: {
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_set_attr(vcpu, &attr);
-	}
-	case KVM_GET_DEVICE_ATTR: {
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_get_attr(vcpu, &attr);
-	}
-	case KVM_HAS_DEVICE_ATTR: {
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_has_attr(vcpu, &attr);
-	}
-	default:
-		return -EINVAL;
-	}
-}
-
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-	bool is_dirty = false;
-	int r;
-
-	mutex_lock(&kvm->slots_lock);
-
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
-
-	if (is_dirty)
-		kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
-					struct kvm_arm_device_addr *dev_addr)
-{
-	unsigned long dev_id, type;
-
-	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
-		KVM_ARM_DEVICE_ID_SHIFT;
-	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
-		KVM_ARM_DEVICE_TYPE_SHIFT;
-
-	switch (dev_id) {
-	case KVM_ARM_DEVICE_VGIC_V2:
-		if (!vgic_present)
-			return -ENXIO;
-		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
-	default:
-		return -ENODEV;
-	}
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
-{
-	struct kvm *kvm = filp->private_data;
-	void __user *argp = (void __user *)arg;
-
-	switch (ioctl) {
-	case KVM_CREATE_IRQCHIP: {
-		int ret;
-		if (!vgic_present)
-			return -ENXIO;
-		mutex_lock(&kvm->lock);
-		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		mutex_unlock(&kvm->lock);
-		return ret;
-	}
-	case KVM_ARM_SET_DEVICE_ADDR: {
-		struct kvm_arm_device_addr dev_addr;
-
-		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
-			return -EFAULT;
-		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
-	}
-	case KVM_ARM_PREFERRED_TARGET: {
-		int err;
-		struct kvm_vcpu_init init;
-
-		err = kvm_vcpu_preferred_target(&init);
-		if (err)
-			return err;
-
-		if (copy_to_user(argp, &init, sizeof(init)))
-			return -EFAULT;
-
-		return 0;
-	}
-	default:
-		return -EINVAL;
-	}
-}
-
-static void cpu_init_hyp_mode(void *dummy)
-{
-	phys_addr_t pgd_ptr;
-	unsigned long hyp_stack_ptr;
-	unsigned long stack_page;
-	unsigned long vector_ptr;
-
-	/* Switch from the HYP stub to our own HYP init vector */
-	__hyp_set_vectors(kvm_get_idmap_vector());
-
-	pgd_ptr = kvm_mmu_get_httbr();
-	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
-	hyp_stack_ptr = stack_page + PAGE_SIZE;
-	vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
-
-	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
-	__cpu_init_stage2();
-
-	if (is_kernel_in_hyp_mode())
-		kvm_timer_init_vhe();
-
-	kvm_arm_init_debug();
-}
-
-static void cpu_hyp_reset(void)
-{
-	if (!is_kernel_in_hyp_mode())
-		__hyp_reset_vectors();
-}
-
-static void cpu_hyp_reinit(void)
-{
-	cpu_hyp_reset();
-
-	if (is_kernel_in_hyp_mode()) {
-		/*
-		 * __cpu_init_stage2() is safe to call even if the PM
-		 * event was cancelled before the CPU was reset.
-		 */
-		__cpu_init_stage2();
-	} else {
-		cpu_init_hyp_mode(NULL);
-	}
-
-	if (vgic_present)
-		kvm_vgic_init_cpu_hardware();
-}
-
-static void _kvm_arch_hardware_enable(void *discard)
-{
-	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reinit();
-		__this_cpu_write(kvm_arm_hardware_enabled, 1);
-	}
-}
-
-int kvm_arch_hardware_enable(void)
-{
-	_kvm_arch_hardware_enable(NULL);
-	return 0;
-}
-
-static void _kvm_arch_hardware_disable(void *discard)
-{
-	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reset();
-		__this_cpu_write(kvm_arm_hardware_enabled, 0);
-	}
-}
-
-void kvm_arch_hardware_disable(void)
-{
-	_kvm_arch_hardware_disable(NULL);
-}
-
-#ifdef CONFIG_CPU_PM
-static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
-				    unsigned long cmd,
-				    void *v)
-{
-	/*
-	 * kvm_arm_hardware_enabled is left with its old value over
-	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
-	 * re-enable hyp.
-	 */
-	switch (cmd) {
-	case CPU_PM_ENTER:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/*
-			 * don't update kvm_arm_hardware_enabled here
-			 * so that the hardware will be re-enabled
-			 * when we resume. See below.
-			 */
-			cpu_hyp_reset();
-
-		return NOTIFY_OK;
-	case CPU_PM_EXIT:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/* The hardware was enabled before suspend. */
-			cpu_hyp_reinit();
-
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block hyp_init_cpu_pm_nb = {
-	.notifier_call = hyp_init_cpu_pm_notifier,
-};
-
-static void __init hyp_cpu_pm_init(void)
-{
-	cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
-}
-static void __init hyp_cpu_pm_exit(void)
-{
-	cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
-}
-#else
-static inline void hyp_cpu_pm_init(void)
-{
-}
-static inline void hyp_cpu_pm_exit(void)
-{
-}
-#endif
-
-static void teardown_common_resources(void)
-{
-	free_percpu(kvm_host_cpu_state);
-}
-
-static int init_common_resources(void)
-{
-	kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-	if (!kvm_host_cpu_state) {
-		kvm_err("Cannot allocate host CPU state\n");
-		return -ENOMEM;
-	}
-
-	/* set size of VMID supported by CPU */
-	kvm_vmid_bits = kvm_get_vmid_bits();
-	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
-	return 0;
-}
-
-static int init_subsystems(void)
-{
-	int err = 0;
-
-	/*
-	 * Enable hardware so that subsystem initialisation can access EL2.
-	 */
-	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
-
-	/*
-	 * Register CPU lower-power notifier
-	 */
-	hyp_cpu_pm_init();
-
-	/*
-	 * Init HYP view of VGIC
-	 */
-	err = kvm_vgic_hyp_init();
-	switch (err) {
-	case 0:
-		vgic_present = true;
-		break;
-	case -ENODEV:
-	case -ENXIO:
-		vgic_present = false;
-		err = 0;
-		break;
-	default:
-		goto out;
-	}
-
-	/*
-	 * Init HYP architected timer support
-	 */
-	err = kvm_timer_hyp_init();
-	if (err)
-		goto out;
-
-	kvm_perf_init();
-	kvm_coproc_table_init();
-
-out:
-	on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
-
-	return err;
-}
-
-static void teardown_hyp_mode(void)
-{
-	int cpu;
-
-	if (is_kernel_in_hyp_mode())
-		return;
-
-	free_hyp_pgds();
-	for_each_possible_cpu(cpu)
-		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
-	hyp_cpu_pm_exit();
-}
-
-static int init_vhe_mode(void)
-{
-	kvm_info("VHE mode initialized successfully\n");
-	return 0;
-}
-
-/**
- * Inits Hyp-mode on all online CPUs
- */
-static int init_hyp_mode(void)
-{
-	int cpu;
-	int err = 0;
-
-	/*
-	 * Allocate Hyp PGD and setup Hyp identity mapping
-	 */
-	err = kvm_mmu_init();
-	if (err)
-		goto out_err;
-
-	/*
-	 * Allocate stack pages for Hypervisor-mode
-	 */
-	for_each_possible_cpu(cpu) {
-		unsigned long stack_page;
-
-		stack_page = __get_free_page(GFP_KERNEL);
-		if (!stack_page) {
-			err = -ENOMEM;
-			goto out_err;
-		}
-
-		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
-	}
-
-	/*
-	 * Map the Hyp-code called directly from the host
-	 */
-	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
-	if (err) {
-		kvm_err("Cannot map world-switch code\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map rodata section\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
-				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map bss section\n");
-		goto out_err;
-	}
-
-	/*
-	 * Map the Hyp stack pages
-	 */
-	for_each_possible_cpu(cpu) {
-		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-					  PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map hyp stack\n");
-			goto out_err;
-		}
-	}
-
-	for_each_possible_cpu(cpu) {
-		kvm_cpu_context_t *cpu_ctxt;
-
-		cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
-		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map host CPU state: %d\n", err);
-			goto out_err;
-		}
-	}
-
-	kvm_info("Hyp mode initialized successfully\n");
-
-	return 0;
-
-out_err:
-	teardown_hyp_mode();
-	kvm_err("error initializing Hyp mode: %d\n", err);
-	return err;
-}
-
-static void check_kvm_target_cpu(void *ret)
-{
-	*(int *)ret = kvm_target_cpu();
-}
-
-struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	mpidr &= MPIDR_HWID_BITMASK;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
-			return vcpu;
-	}
-	return NULL;
-}
-
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
-{
-	int err;
-	int ret, cpu;
-
-	if (!is_hyp_mode_available()) {
-		kvm_err("HYP mode not available\n");
-		return -ENODEV;
-	}
-
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
-		if (ret < 0) {
-			kvm_err("Error, CPU %d not supported!\n", cpu);
-			return -ENODEV;
-		}
-	}
-
-	err = init_common_resources();
-	if (err)
-		return err;
-
-	if (is_kernel_in_hyp_mode())
-		err = init_vhe_mode();
-	else
-		err = init_hyp_mode();
-	if (err)
-		goto out_err;
-
-	err = init_subsystems();
-	if (err)
-		goto out_hyp;
-
-	return 0;
-
-out_hyp:
-	teardown_hyp_mode();
-out_err:
-	teardown_common_resources();
-	return err;
-}
-
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
-	kvm_perf_teardown();
-}
-
-static int arm_init(void)
-{
-	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-	return rc;
-}
-
-module_init(arm_init);
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
deleted file mode 100644
index b6e715fd3c90..000000000000
--- a/arch/arm/kvm/mmio.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_mmio.h>
-#include <asm/kvm_emulate.h>
-#include <trace/events/kvm.h>
-
-#include "trace.h"
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
-{
-	void *datap = NULL;
-	union {
-		u8	byte;
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		tmp.byte	= data;
-		datap		= &tmp.byte;
-		break;
-	case 2:
-		tmp.hword	= data;
-		datap		= &tmp.hword;
-		break;
-	case 4:
-		tmp.word	= data;
-		datap		= &tmp.word;
-		break;
-	case 8:
-		tmp.dword	= data;
-		datap		= &tmp.dword;
-		break;
-	}
-
-	memcpy(buf, datap, len);
-}
-
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
-{
-	unsigned long data = 0;
-	union {
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		data = *(u8 *)buf;
-		break;
-	case 2:
-		memcpy(&tmp.hword, buf, len);
-		data = tmp.hword;
-		break;
-	case 4:
-		memcpy(&tmp.word, buf, len);
-		data = tmp.word;
-		break;
-	case 8:
-		memcpy(&tmp.dword, buf, len);
-		data = tmp.dword;
-		break;
-	}
-
-	return data;
-}
-
-/**
- * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
- *			     or in-kernel IO emulation
- *
- * @vcpu: The VCPU pointer
- * @run:  The VCPU run struct containing the mmio data
- */
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long data;
-	unsigned int len;
-	int mask;
-
-	if (!run->mmio.is_write) {
-		len = run->mmio.len;
-		if (len > sizeof(unsigned long))
-			return -EINVAL;
-
-		data = kvm_mmio_read_buf(run->mmio.data, len);
-
-		if (vcpu->arch.mmio_decode.sign_extend &&
-		    len < sizeof(unsigned long)) {
-			mask = 1U << ((len * 8) - 1);
-			data = (data ^ mask) - mask;
-		}
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-			       data);
-		data = vcpu_data_host_to_guest(vcpu, data, len);
-		vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
-	}
-
-	return 0;
-}
-
-static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
-{
-	unsigned long rt;
-	int access_size;
-	bool sign_extend;
-
-	if (kvm_vcpu_dabt_iss1tw(vcpu)) {
-		/* page table accesses IO mem: tell guest to fix its TTBR */
-		kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
-	access_size = kvm_vcpu_dabt_get_as(vcpu);
-	if (unlikely(access_size < 0))
-		return access_size;
-
-	*is_write = kvm_vcpu_dabt_iswrite(vcpu);
-	sign_extend = kvm_vcpu_dabt_issext(vcpu);
-	rt = kvm_vcpu_dabt_get_rd(vcpu);
-
-	*len = access_size;
-	vcpu->arch.mmio_decode.sign_extend = sign_extend;
-	vcpu->arch.mmio_decode.rt = rt;
-
-	/*
-	 * The MMIO instruction is emulated and should not be re-executed
-	 * in the guest.
-	 */
-	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-	return 0;
-}
-
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		 phys_addr_t fault_ipa)
-{
-	unsigned long data;
-	unsigned long rt;
-	int ret;
-	bool is_write;
-	int len;
-	u8 data_buf[8];
-
-	/*
-	 * Prepare MMIO operation. First decode the syndrome data we get
-	 * from the CPU. Then try if some in-kernel emulation feels
-	 * responsible, otherwise let user space do its magic.
-	 */
-	if (kvm_vcpu_dabt_isvalid(vcpu)) {
-		ret = decode_hsr(vcpu, &is_write, &len);
-		if (ret)
-			return ret;
-	} else {
-		kvm_err("load/store instruction decoding not implemented\n");
-		return -ENOSYS;
-	}
-
-	rt = vcpu->arch.mmio_decode.rt;
-
-	if (is_write) {
-		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
-					       len);
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
-		kvm_mmio_write_buf(data_buf, len, data);
-
-		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				       data_buf);
-	} else {
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
-			       fault_ipa, 0);
-
-		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				      data_buf);
-	}
-
-	/* Now prepare kvm_run for the potential return to userland. */
-	run->mmio.is_write	= is_write;
-	run->mmio.phys_addr	= fault_ipa;
-	run->mmio.len		= len;
-
-	if (!ret) {
-		/* We handled the access successfully in the kernel. */
-		if (!is_write)
-			memcpy(run->mmio.data, data_buf, len);
-		vcpu->stat.mmio_exit_kernel++;
-		kvm_handle_mmio_return(vcpu, run);
-		return 1;
-	}
-
-	if (is_write)
-		memcpy(run->mmio.data, data_buf, len);
-	vcpu->stat.mmio_exit_user++;
-	run->exit_reason	= KVM_EXIT_MMIO;
-	return 0;
-}
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
deleted file mode 100644
index 313ee646480f..000000000000
--- a/arch/arm/kvm/mmu.c
+++ /dev/null
@@ -1,1975 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/mman.h>
-#include <linux/kvm_host.h>
-#include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <trace/events/kvm.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_mmio.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_emulate.h>
-#include <asm/virt.h>
-
-#include "trace.h"
-
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
-static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
-
-#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-
-#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
-
-static bool memslot_is_logging(struct kvm_memory_slot *memslot)
-{
-	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
-}
-
-/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
- * @kvm:	pointer to kvm structure.
- *
- * Interface to HYP function to flush all VM TLB entries
- */
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
-}
-
-static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-	__kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-	__kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-	__kvm_flush_dcache_pud(pud);
-}
-
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
-	return !pfn_valid(pfn);
-}
-
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @kvm:	pointer to kvm structure.
- * @addr:	IPA
- * @pmd:	pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
- */
-static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
-{
-	if (!pmd_thp_or_huge(*pmd))
-		return;
-
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	put_page(virt_to_page(pmd));
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-				  int min, int max)
-{
-	void *page;
-
-	BUG_ON(max > KVM_NR_MEM_OBJS);
-	if (cache->nobjs >= min)
-		return 0;
-	while (cache->nobjs < max) {
-		page = (void *)__get_free_page(PGALLOC_GFP);
-		if (!page)
-			return -ENOMEM;
-		cache->objects[cache->nobjs++] = page;
-	}
-	return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-	while (mc->nobjs)
-		free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-	void *p;
-
-	BUG_ON(!mc || !mc->nobjs);
-	p = mc->objects[--mc->nobjs];
-	return p;
-}
-
-static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
-{
-	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
-	stage2_pgd_clear(pgd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pud_free(pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
-{
-	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
-	VM_BUG_ON(stage2_pud_huge(*pud));
-	stage2_pud_clear(pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pmd_free(pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-/*
- * Unmapping vs dcache management:
- *
- * If a guest maps certain memory pages as uncached, all writes will
- * bypass the data cache and go directly to RAM.  However, the CPUs
- * can still speculate reads (not writes) and fill cache lines with
- * data.
- *
- * Those cache lines will be *clean* cache lines though, so a
- * clean+invalidate operation is equivalent to an invalidate
- * operation, because no cache lines are marked dirty.
- *
- * Those clean cache lines could be filled prior to an uncached write
- * by the guest, and the cache coherent IO subsystem would therefore
- * end up writing old data to disk.
- *
- * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
- */
-static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t start_addr = addr;
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			pte_t old_pte = *pte;
-
-			kvm_set_pte(pte, __pte(0));
-			kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-			/* No need to invalidate the cache for device mappings */
-			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-				kvm_flush_dcache_pte(old_pte);
-
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (stage2_pte_table_empty(start_pte))
-		clear_stage2_pmd_entry(kvm, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = stage2_pmd_offset(pud, addr);
-	do {
-		next = stage2_pmd_addr_end(addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				pmd_t old_pmd = *pmd;
-
-				pmd_clear(pmd);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-				kvm_flush_dcache_pmd(old_pmd);
-
-				put_page(virt_to_page(pmd));
-			} else {
-				unmap_stage2_ptes(kvm, pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-
-	if (stage2_pmd_table_empty(start_pmd))
-		clear_stage2_pud_entry(kvm, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = stage2_pud_offset(pgd, addr);
-	do {
-		next = stage2_pud_addr_end(addr, end);
-		if (!stage2_pud_none(*pud)) {
-			if (stage2_pud_huge(*pud)) {
-				pud_t old_pud = *pud;
-
-				stage2_pud_clear(pud);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-				kvm_flush_dcache_pud(old_pud);
-				put_page(virt_to_page(pud));
-			} else {
-				unmap_stage2_pmds(kvm, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-
-	if (stage2_pud_table_empty(start_pud))
-		clear_stage2_pgd_entry(kvm, pgd, start_addr);
-}
-
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	assert_spin_locked(&kvm->mmu_lock);
-	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
-	do {
-		next = stage2_pgd_addr_end(addr, end);
-		if (!stage2_pgd_none(*pgd))
-			unmap_stage2_puds(kvm, pgd, addr, next);
-		/*
-		 * If the range is too large, release the kvm->mmu_lock
-		 * to prevent starvation and lockup detector warnings.
-		 */
-		if (next != end)
-			cond_resched_lock(&kvm->mmu_lock);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-			kvm_flush_dcache_pte(*pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(pud, addr);
-	do {
-		next = stage2_pmd_addr_end(addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd))
-				kvm_flush_dcache_pmd(*pmd);
-			else
-				stage2_flush_ptes(kvm, pmd, addr, next);
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(pgd, addr);
-	do {
-		next = stage2_pud_addr_end(addr, end);
-		if (!stage2_pud_none(*pud)) {
-			if (stage2_pud_huge(*pud))
-				kvm_flush_dcache_pud(*pud);
-			else
-				stage2_flush_pmds(kvm, pud, addr, next);
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-	phys_addr_t next;
-	pgd_t *pgd;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
-	do {
-		next = stage2_pgd_addr_end(addr, end);
-		stage2_flush_puds(kvm, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
- * @kvm: The struct kvm pointer
- *
- * Go through the stage 2 page tables and invalidate any cache lines
- * backing memory already mapped to the VM.
- */
-static void stage2_flush_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_flush_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
-	pgd_clear(pgd);
-	pud_free(NULL, pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
-	pmd_free(NULL, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			kvm_set_pte(pte, __pte(0));
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (hyp_pte_table_empty(start_pte))
-		clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		/* Hyp doesn't use huge pmds */
-		if (!pmd_none(*pmd))
-			unmap_hyp_ptes(pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-
-	if (hyp_pmd_table_empty(start_pmd))
-		clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		/* Hyp doesn't use huge puds */
-		if (!pud_none(*pud))
-			unmap_hyp_pmds(pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-
-	if (hyp_pud_table_empty(start_pud))
-		clear_hyp_pgd_entry(pgd);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	/*
-	 * We don't unmap anything from HYP, except at the hyp tear down.
-	 * Hence, we don't have to invalidate the TLBs here.
-	 */
-	pgd = pgdp + pgd_index(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_hyp_puds(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the vmalloc range (from
- * VMALLOC_START to VMALLOC_END).
- *
- * boot_hyp_pgd should only map two pages for the init code.
- */
-void free_hyp_pgds(void)
-{
-	unsigned long addr;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	if (boot_hyp_pgd) {
-		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-		boot_hyp_pgd = NULL;
-	}
-
-	if (hyp_pgd) {
-		unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
-		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
-
-		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-		hyp_pgd = NULL;
-	}
-	if (merged_hyp_pgd) {
-		clear_page(merged_hyp_pgd);
-		free_page((unsigned long)merged_hyp_pgd);
-		merged_hyp_pgd = NULL;
-	}
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-				    unsigned long end, unsigned long pfn,
-				    pgprot_t prot)
-{
-	pte_t *pte;
-	unsigned long addr;
-
-	addr = start;
-	do {
-		pte = pte_offset_kernel(pmd, addr);
-		kvm_set_pte(pte, pfn_pte(pfn, prot));
-		get_page(virt_to_page(pte));
-		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr, next;
-
-	addr = start;
-	do {
-		pmd = pmd_offset(pud, addr);
-
-		BUG_ON(pmd_sect(*pmd));
-
-		if (pmd_none(*pmd)) {
-			pte = pte_alloc_one_kernel(NULL, addr);
-			if (!pte) {
-				kvm_err("Cannot allocate Hyp pte\n");
-				return -ENOMEM;
-			}
-			pmd_populate_kernel(NULL, pmd, pte);
-			get_page(virt_to_page(pmd));
-			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
-		}
-
-		next = pmd_addr_end(addr, end);
-
-		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr, next;
-	int ret;
-
-	addr = start;
-	do {
-		pud = pud_offset(pgd, addr);
-
-		if (pud_none_or_clear_bad(pud)) {
-			pmd = pmd_alloc_one(NULL, addr);
-			if (!pmd) {
-				kvm_err("Cannot allocate Hyp pmd\n");
-				return -ENOMEM;
-			}
-			pud_populate(NULL, pud, pmd);
-			get_page(virt_to_page(pud));
-			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
-		}
-
-		next = pud_addr_end(addr, end);
-		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-		if (ret)
-			return ret;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp,
-				 unsigned long start, unsigned long end,
-				 unsigned long pfn, pgprot_t prot)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	unsigned long addr, next;
-	int err = 0;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-	addr = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-	do {
-		pgd = pgdp + pgd_index(addr);
-
-		if (pgd_none(*pgd)) {
-			pud = pud_alloc_one(NULL, addr);
-			if (!pud) {
-				kvm_err("Cannot allocate Hyp pud\n");
-				err = -ENOMEM;
-				goto out;
-			}
-			pgd_populate(NULL, pgd, pud);
-			get_page(virt_to_page(pgd));
-			kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
-		}
-
-		next = pgd_addr_end(addr, end);
-		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
-		if (err)
-			goto out;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-out:
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-	return err;
-}
-
-static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
-{
-	if (!is_vmalloc_addr(kaddr)) {
-		BUG_ON(!virt_addr_valid(kaddr));
-		return __pa(kaddr);
-	} else {
-		return page_to_phys(vmalloc_to_page(kaddr)) +
-		       offset_in_page(kaddr);
-	}
-}
-
-/**
- * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
- * @from:	The virtual kernel start address of the range
- * @to:		The virtual kernel end address of the range (exclusive)
- * @prot:	The protection to be applied to this range
- *
- * The same virtual address as the kernel virtual address is also used
- * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
- * physical pages.
- */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
-{
-	phys_addr_t phys_addr;
-	unsigned long virt_addr;
-	unsigned long start = kern_hyp_va((unsigned long)from);
-	unsigned long end = kern_hyp_va((unsigned long)to);
-
-	if (is_kernel_in_hyp_mode())
-		return 0;
-
-	start = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-
-	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
-		int err;
-
-		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-		err = __create_hyp_mappings(hyp_pgd, virt_addr,
-					    virt_addr + PAGE_SIZE,
-					    __phys_to_pfn(phys_addr),
-					    prot);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-/**
- * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
- * @from:	The kernel start VA of the range
- * @to:		The kernel end VA of the range (exclusive)
- * @phys_addr:	The physical start address which gets mapped
- *
- * The resulting HYP VA is the same as the kernel VA, modulo
- * HYP_PAGE_OFFSET.
- */
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
-{
-	unsigned long start = kern_hyp_va((unsigned long)from);
-	unsigned long end = kern_hyp_va((unsigned long)to);
-
-	if (is_kernel_in_hyp_mode())
-		return 0;
-
-	/* Check for a valid kernel IO mapping */
-	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
-		return -EINVAL;
-
-	return __create_hyp_mappings(hyp_pgd, start, end,
-				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
-}
-
-/**
- * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_alloc_stage2_pgd(struct kvm *kvm)
-{
-	pgd_t *pgd;
-
-	if (kvm->arch.pgd != NULL) {
-		kvm_err("kvm_arch already initialized?\n");
-		return -EINVAL;
-	}
-
-	/* Allocate the HW PGD, making sure that each page gets its own refcount */
-	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
-	if (!pgd)
-		return -ENOMEM;
-
-	kvm->arch.pgd = pgd;
-	return 0;
-}
-
-static void stage2_unmap_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	hva_t hva = memslot->userspace_addr;
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = PAGE_SIZE * memslot->npages;
-	hva_t reg_end = hva + size;
-
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we should
-	 * unmap any of them.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (!(vma->vm_flags & VM_PFNMAP)) {
-			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-}
-
-/**
- * stage2_unmap_vm - Unmap Stage-2 RAM mappings
- * @kvm: The struct kvm pointer
- *
- * Go through the memregions and unmap any reguler RAM
- * backing memory already mapped to the VM.
- */
-void stage2_unmap_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	down_read(&current->mm->mmap_sem);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_unmap_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	up_read(&current->mm->mmap_sem);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-/**
- * kvm_free_stage2_pgd - free all stage-2 tables
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
- * underlying level-2 and level-3 tables before freeing the actual level-1 table
- * and setting the struct pointer to NULL.
- *
- * Note we don't need locking here as this is only called when the VM is
- * destroyed, which can only be done once.
- */
-void kvm_free_stage2_pgd(struct kvm *kvm)
-{
-	if (kvm->arch.pgd == NULL)
-		return;
-
-	spin_lock(&kvm->mmu_lock);
-	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-	spin_unlock(&kvm->mmu_lock);
-
-	/* Free the HW pgd, one page at a time */
-	free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
-	kvm->arch.pgd = NULL;
-}
-
-static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
-	if (WARN_ON(stage2_pgd_none(*pgd))) {
-		if (!cache)
-			return NULL;
-		pud = mmu_memory_cache_alloc(cache);
-		stage2_pgd_populate(pgd, pud);
-		get_page(virt_to_page(pgd));
-	}
-
-	return stage2_pud_offset(pgd, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-
-	pud = stage2_get_pud(kvm, cache, addr);
-	if (stage2_pud_none(*pud)) {
-		if (!cache)
-			return NULL;
-		pmd = mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	return stage2_pmd_offset(pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
-			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
-{
-	pmd_t *pmd, old_pmd;
-
-	pmd = stage2_get_pmd(kvm, cache, addr);
-	VM_BUG_ON(!pmd);
-
-	/*
-	 * Mapping in huge pages should only happen through a fault.  If a
-	 * page is merged into a transparent huge page, the individual
-	 * subpages of that huge page should be unmapped through MMU
-	 * notifiers before we get here.
-	 *
-	 * Merging of CompoundPages is not supported; they should become
-	 * splitting first, unmapped, merged, and mapped back in on-demand.
-	 */
-	VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
-
-	old_pmd = *pmd;
-	if (pmd_present(old_pmd)) {
-		pmd_clear(pmd);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pmd));
-	}
-
-	kvm_set_pmd(pmd, *new_pmd);
-	return 0;
-}
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte,
-			  unsigned long flags)
-{
-	pmd_t *pmd;
-	pte_t *pte, old_pte;
-	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-	VM_BUG_ON(logging_active && !cache);
-
-	/* Create stage-2 page table mapping - Levels 0 and 1 */
-	pmd = stage2_get_pmd(kvm, cache, addr);
-	if (!pmd) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PMD, then continue on to
-	 * allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pmd(kvm, addr, pmd);
-
-	/* Create stage-2 page mappings - Level 2 */
-	if (pmd_none(*pmd)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pte = mmu_memory_cache_alloc(cache);
-		pmd_populate_kernel(NULL, pmd, pte);
-		get_page(virt_to_page(pmd));
-	}
-
-	pte = pte_offset_kernel(pmd, addr);
-
-	if (iomap && pte_present(*pte))
-		return -EFAULT;
-
-	/* Create 2nd stage page table mapping - Level 3 */
-	old_pte = *pte;
-	if (pte_present(old_pte)) {
-		kvm_set_pte(pte, __pte(0));
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pte));
-	}
-
-	kvm_set_pte(pte, *new_pte);
-	return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);
-		return 1;
-	}
-	return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-/**
- * kvm_phys_addr_ioremap - map a device range to guest IPA
- *
- * @kvm:	The KVM pointer
- * @guest_ipa:	The IPA at which to insert the mapping
- * @pa:		The physical address of the device
- * @size:	The size of the mapping
- */
-int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
-			  phys_addr_t pa, unsigned long size, bool writable)
-{
-	phys_addr_t addr, end;
-	int ret = 0;
-	unsigned long pfn;
-	struct kvm_mmu_memory_cache cache = { 0, };
-
-	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-	pfn = __phys_to_pfn(pa);
-
-	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
-
-		if (writable)
-			pte = kvm_s2pte_mkwrite(pte);
-
-		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
-						KVM_NR_MEM_OBJS);
-		if (ret)
-			goto out;
-		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte,
-						KVM_S2PTE_FLAG_IS_IOMAP);
-		spin_unlock(&kvm->mmu_lock);
-		if (ret)
-			goto out;
-
-		pfn++;
-	}
-
-out:
-	mmu_free_memory_cache(&cache);
-	return ret;
-}
-
-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
-{
-	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *ipap >> PAGE_SHIFT;
-
-	if (PageTransCompoundMap(pfn_to_page(pfn))) {
-		unsigned long mask;
-		/*
-		 * The address we faulted on is backed by a transparent huge
-		 * page.  However, because we map the compound huge page and
-		 * not the individual tail page, we need to transfer the
-		 * refcount to the head page.  We have to be careful that the
-		 * THP doesn't start to split while we are adjusting the
-		 * refcounts.
-		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
-		 * was successful and we are holding the mmu_lock, so if this
-		 * THP is trying to split, it will be blocked in the mmu
-		 * notifier before touching any of the pages, specifically
-		 * before being able to call __split_huge_page_refcount().
-		 *
-		 * We can therefore safely transfer the refcount from PG_tail
-		 * to PG_head and switch the pfn from a tail page to the head
-		 * page accordingly.
-		 */
-		mask = PTRS_PER_PMD - 1;
-		VM_BUG_ON((gfn & mask) != (pfn & mask));
-		if (pfn & mask) {
-			*ipap &= PMD_MASK;
-			kvm_release_pfn_clean(pfn);
-			pfn &= ~mask;
-			kvm_get_pfn(pfn);
-			*pfnp = pfn;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
-static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vcpu_trap_is_iabt(vcpu))
-		return false;
-
-	return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:	pointer to pmd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			if (!kvm_s2pte_readonly(pte))
-				kvm_set_s2pte_readonly(pte);
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * @pud:	pointer to pud entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(pud, addr);
-
-	do {
-		next = stage2_pmd_addr_end(addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				if (!kvm_s2pmd_readonly(pmd))
-					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-/**
-  * stage2_wp_puds - write protect PGD range
-  * @pgd:	pointer to pgd entry
-  * @addr:	range start address
-  * @end:	range end address
-  *
-  * Process PUD entries, for a huge PUD we cause a panic.
-  */
-static void  stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(pgd, addr);
-	do {
-		next = stage2_pud_addr_end(addr, end);
-		if (!stage2_pud_none(*pud)) {
-			/* TODO:PUD not supported, revisit later if supported */
-			BUG_ON(stage2_pud_huge(*pud));
-			stage2_wp_pmds(pud, addr, next);
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_range() - write protect stage2 memory region range
- * @kvm:	The KVM pointer
- * @addr:	Start address of range
- * @end:	End address of range
- */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-	pgd_t *pgd;
-	phys_addr_t next;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
-	do {
-		/*
-		 * Release kvm_mmu_lock periodically if the memory region is
-		 * large. Otherwise, we may see kernel panics with
-		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
-		 * will also starve other vCPUs.
-		 */
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-			cond_resched_lock(&kvm->mmu_lock);
-
-		next = stage2_pgd_addr_end(addr, end);
-		if (stage2_pgd_present(*pgd))
-			stage2_wp_puds(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
- * @kvm:	The KVM pointer
- * @slot:	The memory slot to write protect
- *
- * Called to start logging dirty pages after memory region
- * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PMD and PTEs are write protected in the memory region.
- * Afterwards read of dirty page log can be called.
- *
- * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
- * serializing operations for VM memory regions.
- */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
-{
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
-	spin_unlock(&kvm->mmu_lock);
-	kvm_flush_remote_tlbs(kvm);
-}
-
-/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
- * @kvm:	The KVM pointer
- * @slot:	The memory slot associated with mask
- * @gfn_offset:	The gfn offset in memory slot
- * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
- *		slot to be write protected
- *
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
- */
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
-	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
-	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
-
-	stage2_wp_range(kvm, start, end);
-}
-
-/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- */
-void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
-}
-
-static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
-				      unsigned long size)
-{
-	__coherent_cache_guest_page(vcpu, pfn, size);
-}
-
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  struct kvm_memory_slot *memslot, unsigned long hva,
-			  unsigned long fault_status)
-{
-	int ret;
-	bool write_fault, writable, hugetlb = false, force_pte = false;
-	unsigned long mmu_seq;
-	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
-	struct vm_area_struct *vma;
-	kvm_pfn_t pfn;
-	pgprot_t mem_type = PAGE_S2;
-	bool logging_active = memslot_is_logging(memslot);
-	unsigned long flags = 0;
-
-	write_fault = kvm_is_write_fault(vcpu);
-	if (fault_status == FSC_PERM && !write_fault) {
-		kvm_err("Unexpected L2 read permission error\n");
-		return -EFAULT;
-	}
-
-	/* Let's check if we will get back a huge page backed by hugetlbfs */
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma_intersection(current->mm, hva, hva + 1);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		up_read(&current->mm->mmap_sem);
-		return -EFAULT;
-	}
-
-	if (is_vm_hugetlb_page(vma) && !logging_active) {
-		hugetlb = true;
-		gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
-	} else {
-		/*
-		 * Pages belonging to memslots that don't have the same
-		 * alignment for userspace and IPA cannot be mapped using
-		 * block descriptors even if the pages belong to a THP for
-		 * the process, because the stage-2 block descriptor will
-		 * cover more than a single THP and we loose atomicity for
-		 * unmapping, updates, and splits of the THP or other pages
-		 * in the stage-2 block range.
-		 */
-		if ((memslot->userspace_addr & ~PMD_MASK) !=
-		    ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
-			force_pte = true;
-	}
-	up_read(&current->mm->mmap_sem);
-
-	/* We need minimum second+third level pages */
-	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
-				     KVM_NR_MEM_OBJS);
-	if (ret)
-		return ret;
-
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/*
-	 * Ensure the read of mmu_notifier_seq happens before we call
-	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-	 * the page we just got a reference to gets unmapped before we have a
-	 * chance to grab the mmu_lock, which ensure that if the page gets
-	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
-	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
-	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
-	 */
-	smp_rmb();
-
-	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
-	if (is_error_noslot_pfn(pfn))
-		return -EFAULT;
-
-	if (kvm_is_device_pfn(pfn)) {
-		mem_type = PAGE_S2_DEVICE;
-		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-	} else if (logging_active) {
-		/*
-		 * Faults on pages in a memslot with logging enabled
-		 * should not be mapped with huge pages (it introduces churn
-		 * and performance degradation), so force a pte mapping.
-		 */
-		force_pte = true;
-		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
-		/*
-		 * Only actually map the page as writable if this was a write
-		 * fault.
-		 */
-		if (!write_fault)
-			writable = false;
-	}
-
-	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(kvm, mmu_seq))
-		goto out_unlock;
-
-	if (!hugetlb && !force_pte)
-		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
-
-	if (hugetlb) {
-		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
-		new_pmd = pmd_mkhuge(new_pmd);
-		if (writable) {
-			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-			kvm_set_pfn_dirty(pfn);
-		}
-		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
-		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
-	} else {
-		pte_t new_pte = pfn_pte(pfn, mem_type);
-
-		if (writable) {
-			new_pte = kvm_s2pte_mkwrite(new_pte);
-			kvm_set_pfn_dirty(pfn);
-			mark_page_dirty(kvm, gfn);
-		}
-		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
-	}
-
-out_unlock:
-	spin_unlock(&kvm->mmu_lock);
-	kvm_set_pfn_accessed(pfn);
-	kvm_release_pfn_clean(pfn);
-	return ret;
-}
-
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
-static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	kvm_pfn_t pfn;
-	bool pfn_valid = false;
-
-	trace_kvm_access_fault(fault_ipa);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-
-	pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
-	if (!pmd || pmd_none(*pmd))	/* Nothing there */
-		goto out;
-
-	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
-		*pmd = pmd_mkyoung(*pmd);
-		pfn = pmd_pfn(*pmd);
-		pfn_valid = true;
-		goto out;
-	}
-
-	pte = pte_offset_kernel(pmd, fault_ipa);
-	if (pte_none(*pte))		/* Nothing there either */
-		goto out;
-
-	*pte = pte_mkyoung(*pte);	/* Just a page... */
-	pfn = pte_pfn(*pte);
-	pfn_valid = true;
-out:
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (pfn_valid)
-		kvm_set_pfn_accessed(pfn);
-}
-
-/**
- * kvm_handle_guest_abort - handles all 2nd stage aborts
- * @vcpu:	the VCPU pointer
- * @run:	the kvm_run structure
- *
- * Any abort that gets to the host is almost guaranteed to be caused by a
- * missing second stage translation table entry, which can mean that either the
- * guest simply needs more memory and we must allocate an appropriate page or it
- * can mean that the guest tried to access I/O memory, which is emulated by user
- * space. The distinction is based on the IPA causing the fault and whether this
- * memory region has been registered as standard RAM by user space.
- */
-int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long fault_status;
-	phys_addr_t fault_ipa;
-	struct kvm_memory_slot *memslot;
-	unsigned long hva;
-	bool is_iabt, write_fault, writable;
-	gfn_t gfn;
-	int ret, idx;
-
-	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
-	if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) {
-		kvm_inject_vabt(vcpu);
-		return 1;
-	}
-
-	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-
-	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
-			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
-
-	/* Check the stage-2 fault is trans. fault or write fault */
-	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
-	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
-	    fault_status != FSC_ACCESS) {
-		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
-			kvm_vcpu_trap_get_class(vcpu),
-			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
-			(unsigned long)kvm_vcpu_get_hsr(vcpu));
-		return -EFAULT;
-	}
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-	gfn = fault_ipa >> PAGE_SHIFT;
-	memslot = gfn_to_memslot(vcpu->kvm, gfn);
-	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
-	write_fault = kvm_is_write_fault(vcpu);
-	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
-		if (is_iabt) {
-			/* Prefetch Abort on I/O address */
-			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-			ret = 1;
-			goto out_unlock;
-		}
-
-		/*
-		 * Check for a cache maintenance operation. Since we
-		 * ended-up here, we know it is outside of any memory
-		 * slot. But we can't find out if that is for a device,
-		 * or if the guest is just being stupid. The only thing
-		 * we know for sure is that this range cannot be cached.
-		 *
-		 * So let's assume that the guest is just being
-		 * cautious, and skip the instruction.
-		 */
-		if (kvm_vcpu_dabt_is_cm(vcpu)) {
-			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-			ret = 1;
-			goto out_unlock;
-		}
-
-		/*
-		 * The IPA is reported as [MAX:12], so we need to
-		 * complement it with the bottom 12 bits from the
-		 * faulting VA. This is always 12 bits, irrespective
-		 * of the page size.
-		 */
-		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-		ret = io_mem_abort(vcpu, run, fault_ipa);
-		goto out_unlock;
-	}
-
-	/* Userspace should not be able to register out-of-bounds IPAs */
-	VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
-
-	if (fault_status == FSC_ACCESS) {
-		handle_access_fault(vcpu, fault_ipa);
-		ret = 1;
-		goto out_unlock;
-	}
-
-	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
-	if (ret == 0)
-		ret = 1;
-out_unlock:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
-}
-
-static int handle_hva_to_gpa(struct kvm *kvm,
-			     unsigned long start,
-			     unsigned long end,
-			     int (*handler)(struct kvm *kvm,
-					    gpa_t gpa, u64 size,
-					    void *data),
-			     void *data)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int ret = 0;
-
-	slots = kvm_memslots(kvm);
-
-	/* we only care about the pages that the guest sees */
-	kvm_for_each_memslot(memslot, slots) {
-		unsigned long hva_start, hva_end;
-		gfn_t gpa;
-
-		hva_start = max(start, memslot->userspace_addr);
-		hva_end = min(end, memslot->userspace_addr +
-					(memslot->npages << PAGE_SHIFT));
-		if (hva_start >= hva_end)
-			continue;
-
-		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-	}
-
-	return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	unmap_stage2_range(kvm, gpa, size);
-	return 0;
-}
-
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
-	unsigned long end = hva + PAGE_SIZE;
-
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_unmap_hva(hva);
-	handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
-	return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_unmap_hva_range(start, end);
-	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-	return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pte_t *pte = (pte_t *)data;
-
-	WARN_ON(size != PAGE_SIZE);
-	/*
-	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-	 * flag clear because MMU notifiers will have unmapped a huge PMD before
-	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-	 * therefore stage2_set_pte() never needs to clear out a huge PMD
-	 * through this calling path.
-	 */
-	stage2_set_pte(kvm, NULL, gpa, pte, 0);
-	return 0;
-}
-
-
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-	unsigned long end = hva + PAGE_SIZE;
-	pte_t stage2_pte;
-
-	if (!kvm->arch.pgd)
-		return;
-
-	trace_kvm_set_spte_hva(hva);
-	stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
-	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
-	pmd = stage2_get_pmd(kvm, NULL, gpa);
-	if (!pmd || pmd_none(*pmd))	/* Nothing there */
-		return 0;
-
-	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
-		return stage2_pmdp_test_and_clear_young(pmd);
-
-	pte = pte_offset_kernel(pmd, gpa);
-	if (pte_none(*pte))
-		return 0;
-
-	return stage2_ptep_test_and_clear_young(pte);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
-	pmd = stage2_get_pmd(kvm, NULL, gpa);
-	if (!pmd || pmd_none(*pmd))	/* Nothing there */
-		return 0;
-
-	if (pmd_thp_or_huge(*pmd))		/* THP, HugeTLB */
-		return pmd_young(*pmd);
-
-	pte = pte_offset_kernel(pmd, gpa);
-	if (!pte_none(*pte))		/* Just a page... */
-		return pte_young(*pte);
-
-	return 0;
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-	trace_kvm_age_hva(start, end);
-	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-	trace_kvm_test_age_hva(hva);
-	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
-phys_addr_t kvm_mmu_get_httbr(void)
-{
-	if (__kvm_cpu_uses_extended_idmap())
-		return virt_to_phys(merged_hyp_pgd);
-	else
-		return virt_to_phys(hyp_pgd);
-}
-
-phys_addr_t kvm_get_idmap_vector(void)
-{
-	return hyp_idmap_vector;
-}
-
-static int kvm_map_idmap_text(pgd_t *pgd)
-{
-	int err;
-
-	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(pgd,
-				      hyp_idmap_start, hyp_idmap_end,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
-	if (err)
-		kvm_err("Failed to idmap %lx-%lx\n",
-			hyp_idmap_start, hyp_idmap_end);
-
-	return err;
-}
-
-int kvm_mmu_init(void)
-{
-	int err;
-
-	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
-	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
-	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
-
-	/*
-	 * We rely on the linker script to ensure at build time that the HYP
-	 * init code does not cross a page boundary.
-	 */
-	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
-
-	kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
-	kvm_info("HYP VA range: %lx:%lx\n",
-		 kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
-
-	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
-	    hyp_idmap_start <  kern_hyp_va(~0UL) &&
-	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
-		/*
-		 * The idmap page is intersecting with the VA space,
-		 * it is not safe to continue further.
-		 */
-		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
-		err = -EINVAL;
-		goto out;
-	}
-
-	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-	if (!hyp_pgd) {
-		kvm_err("Hyp mode PGD not allocated\n");
-		err = -ENOMEM;
-		goto out;
-	}
-
-	if (__kvm_cpu_uses_extended_idmap()) {
-		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-							 hyp_pgd_order);
-		if (!boot_hyp_pgd) {
-			kvm_err("Hyp boot PGD not allocated\n");
-			err = -ENOMEM;
-			goto out;
-		}
-
-		err = kvm_map_idmap_text(boot_hyp_pgd);
-		if (err)
-			goto out;
-
-		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		if (!merged_hyp_pgd) {
-			kvm_err("Failed to allocate extra HYP pgd\n");
-			goto out;
-		}
-		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-				    hyp_idmap_start);
-	} else {
-		err = kvm_map_idmap_text(hyp_pgd);
-		if (err)
-			goto out;
-	}
-
-	return 0;
-out:
-	free_hyp_pgds();
-	return err;
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   const struct kvm_userspace_memory_region *mem,
-				   const struct kvm_memory_slot *old,
-				   const struct kvm_memory_slot *new,
-				   enum kvm_mr_change change)
-{
-	/*
-	 * At this point memslot has been committed and there is an
-	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
-	 * memory slot is write protected.
-	 */
-	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
-		kvm_mmu_wp_memory_region(kvm, mem->slot);
-}
-
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
-				   struct kvm_memory_slot *memslot,
-				   const struct kvm_userspace_memory_region *mem,
-				   enum kvm_mr_change change)
-{
-	hva_t hva = mem->userspace_addr;
-	hva_t reg_end = hva + mem->memory_size;
-	bool writable = !(mem->flags & KVM_MEM_READONLY);
-	int ret = 0;
-
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
-			change != KVM_MR_FLAGS_ONLY)
-		return 0;
-
-	/*
-	 * Prevent userspace from creating a memory region outside of the IPA
-	 * space addressable by the KVM guest IPA space.
-	 */
-	if (memslot->base_gfn + memslot->npages >=
-	    (KVM_PHYS_SIZE >> PAGE_SHIFT))
-		return -EFAULT;
-
-	down_read(&current->mm->mmap_sem);
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we can map
-	 * any of them right now.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Mapping a read-only VMA is only allowed if the
-		 * memory region is configured as read-only.
-		 */
-		if (writable && !(vma->vm_flags & VM_WRITE)) {
-			ret = -EPERM;
-			break;
-		}
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (vma->vm_flags & VM_PFNMAP) {
-			gpa_t gpa = mem->guest_phys_addr +
-				    (vm_start - mem->userspace_addr);
-			phys_addr_t pa;
-
-			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
-			pa += vm_start - vma->vm_start;
-
-			/* IO region dirty page logging not allowed */
-			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
-						    vm_end - vm_start,
-						    writable);
-			if (ret)
-				break;
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-
-	if (change == KVM_MR_FLAGS_ONLY)
-		goto out;
-
-	spin_lock(&kvm->mmu_lock);
-	if (ret)
-		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-	else
-		stage2_flush_memslot(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
-out:
-	up_read(&current->mm->mmap_sem);
-	return ret;
-}
-
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-			   struct kvm_memory_slot *dont)
-{
-}
-
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-			    unsigned long npages)
-{
-	return 0;
-}
-
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
-{
-}
-
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
-	kvm_free_stage2_pgd(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-				   struct kvm_memory_slot *slot)
-{
-	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = slot->npages << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	unmap_stage2_range(kvm, gpa, size);
-	spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
- *
- * Main problems:
- * - S/W ops are local to a CPU (not broadcast)
- * - We have line migration behind our back (speculation)
- * - System caches don't support S/W at all (damn!)
- *
- * In the face of the above, the best we can do is to try and convert
- * S/W ops to VA ops. Because the guest is not allowed to infer the
- * S/W to PA mapping, it can only use S/W to nuke the whole cache,
- * which is a rather good thing for us.
- *
- * Also, it is only used when turning caches on/off ("The expected
- * usage of the cache maintenance instructions that operate by set/way
- * is associated with the cache maintenance instructions associated
- * with the powerdown and powerup of caches, if this is required by
- * the implementation.").
- *
- * We use the following policy:
- *
- * - If we trap a S/W operation, we enable VM trapping to detect
- *   caches being turned on/off, and do a full clean.
- *
- * - We flush the caches on both caches being turned on and off.
- *
- * - Once the caches are enabled, we stop trapping VM ops.
- */
-void kvm_set_way_flush(struct kvm_vcpu *vcpu)
-{
-	unsigned long hcr = vcpu_get_hcr(vcpu);
-
-	/*
-	 * If this is the first time we do a S/W operation
-	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
-	 * VM trapping.
-	 *
-	 * Otherwise, rely on the VM trapping to wait for the MMU +
-	 * Caches to be turned off. At that point, we'll be able to
-	 * clean the caches again.
-	 */
-	if (!(hcr & HCR_TVM)) {
-		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
-					vcpu_has_cache_enabled(vcpu));
-		stage2_flush_vm(vcpu->kvm);
-		vcpu_set_hcr(vcpu, hcr | HCR_TVM);
-	}
-}
-
-void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
-{
-	bool now_enabled = vcpu_has_cache_enabled(vcpu);
-
-	/*
-	 * If switching the MMU+caches on, need to invalidate the caches.
-	 * If switching it off, need to clean the caches.
-	 * Clean + invalidate does the trick always.
-	 */
-	if (now_enabled != was_enabled)
-		stage2_flush_vm(vcpu->kvm);
-
-	/* Caches are now on, stop trapping VM ops (until a S/W op) */
-	if (now_enabled)
-		vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
-
-	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
-}
diff --git a/arch/arm/kvm/perf.c b/arch/arm/kvm/perf.c
deleted file mode 100644
index 1a3849da0b4b..000000000000
--- a/arch/arm/kvm/perf.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Based on the x86 implementation.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/perf_event.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-static int kvm_is_in_guest(void)
-{
-        return kvm_arm_get_running_vcpu() != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_arm_get_running_vcpu();
-
-	if (vcpu)
-		return !vcpu_mode_priv(vcpu);
-
-	return 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_arm_get_running_vcpu();
-
-	if (vcpu)
-		return *vcpu_pc(vcpu);
-
-	return 0;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
-	.is_in_guest	= kvm_is_in_guest,
-	.is_user_mode	= kvm_is_user_mode,
-	.get_guest_ip	= kvm_get_guest_ip,
-};
-
-int kvm_perf_init(void)
-{
-	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
-}
-
-int kvm_perf_teardown(void)
-{
-	return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-}
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
deleted file mode 100644
index a08d7a93aebb..000000000000
--- a/arch/arm/kvm/psci.c
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * Copyright (C) 2012 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/preempt.h>
-#include <linux/kvm_host.h>
-#include <linux/wait.h>
-
-#include <asm/cputype.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_psci.h>
-#include <asm/kvm_host.h>
-
-#include <uapi/linux/psci.h>
-
-/*
- * This is an implementation of the Power State Coordination Interface
- * as described in ARM document number ARM DEN 0022A.
- */
-
-#define AFFINITY_MASK(level)	~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
-
-static unsigned long psci_affinity_mask(unsigned long affinity_level)
-{
-	if (affinity_level <= 3)
-		return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
-
-	return 0;
-}
-
-static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * NOTE: For simplicity, we make VCPU suspend emulation to be
-	 * same-as WFI (Wait-for-interrupt) emulation.
-	 *
-	 * This means for KVM the wakeup events are interrupts and
-	 * this is consistent with intended use of StateID as described
-	 * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
-	 *
-	 * Further, we also treat power-down request to be same as
-	 * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2
-	 * specification (ARM DEN 0022A). This means all suspend states
-	 * for KVM will preserve the register state.
-	 */
-	kvm_vcpu_block(vcpu);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.power_off = true;
-}
-
-static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
-{
-	struct kvm *kvm = source_vcpu->kvm;
-	struct kvm_vcpu *vcpu = NULL;
-	struct swait_queue_head *wq;
-	unsigned long cpu_id;
-	unsigned long context_id;
-	phys_addr_t target_pc;
-
-	cpu_id = vcpu_get_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK;
-	if (vcpu_mode_is_32bit(source_vcpu))
-		cpu_id &= ~((u32) 0);
-
-	vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
-
-	/*
-	 * Make sure the caller requested a valid CPU and that the CPU is
-	 * turned off.
-	 */
-	if (!vcpu)
-		return PSCI_RET_INVALID_PARAMS;
-	if (!vcpu->arch.power_off) {
-		if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
-			return PSCI_RET_ALREADY_ON;
-		else
-			return PSCI_RET_INVALID_PARAMS;
-	}
-
-	target_pc = vcpu_get_reg(source_vcpu, 2);
-	context_id = vcpu_get_reg(source_vcpu, 3);
-
-	kvm_reset_vcpu(vcpu);
-
-	/* Gracefully handle Thumb2 entry point */
-	if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
-		target_pc &= ~((phys_addr_t) 1);
-		vcpu_set_thumb(vcpu);
-	}
-
-	/* Propagate caller endianness */
-	if (kvm_vcpu_is_be(source_vcpu))
-		kvm_vcpu_set_be(vcpu);
-
-	*vcpu_pc(vcpu) = target_pc;
-	/*
-	 * NOTE: We always update r0 (or x0) because for PSCI v0.1
-	 * the general puspose registers are undefined upon CPU_ON.
-	 */
-	vcpu_set_reg(vcpu, 0, context_id);
-	vcpu->arch.power_off = false;
-	smp_mb();		/* Make sure the above is visible */
-
-	wq = kvm_arch_vcpu_wq(vcpu);
-	swake_up(wq);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
-{
-	int i, matching_cpus = 0;
-	unsigned long mpidr;
-	unsigned long target_affinity;
-	unsigned long target_affinity_mask;
-	unsigned long lowest_affinity_level;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *tmp;
-
-	target_affinity = vcpu_get_reg(vcpu, 1);
-	lowest_affinity_level = vcpu_get_reg(vcpu, 2);
-
-	/* Determine target affinity mask */
-	target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
-	if (!target_affinity_mask)
-		return PSCI_RET_INVALID_PARAMS;
-
-	/* Ignore other bits of target affinity */
-	target_affinity &= target_affinity_mask;
-
-	/*
-	 * If one or more VCPU matching target affinity are running
-	 * then ON else OFF
-	 */
-	kvm_for_each_vcpu(i, tmp, kvm) {
-		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
-		if ((mpidr & target_affinity_mask) == target_affinity) {
-			matching_cpus++;
-			if (!tmp->arch.power_off)
-				return PSCI_0_2_AFFINITY_LEVEL_ON;
-		}
-	}
-
-	if (!matching_cpus)
-		return PSCI_RET_INVALID_PARAMS;
-
-	return PSCI_0_2_AFFINITY_LEVEL_OFF;
-}
-
-static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
-{
-	int i;
-	struct kvm_vcpu *tmp;
-
-	/*
-	 * The KVM ABI specifies that a system event exit may call KVM_RUN
-	 * again and may perform shutdown/reboot at a later time that when the
-	 * actual request is made.  Since we are implementing PSCI and a
-	 * caller of PSCI reboot and shutdown expects that the system shuts
-	 * down or reboots immediately, let's make sure that VCPUs are not run
-	 * after this call is handled and before the VCPUs have been
-	 * re-initialized.
-	 */
-	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
-		tmp->arch.power_off = true;
-		kvm_vcpu_kick(tmp);
-	}
-
-	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
-	vcpu->run->system_event.type = type;
-	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
-}
-
-static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
-}
-
-static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
-}
-
-int kvm_psci_version(struct kvm_vcpu *vcpu)
-{
-	if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
-		return KVM_ARM_PSCI_0_2;
-
-	return KVM_ARM_PSCI_0_1;
-}
-
-static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
-	unsigned long val;
-	int ret = 1;
-
-	switch (psci_fn) {
-	case PSCI_0_2_FN_PSCI_VERSION:
-		/*
-		 * Bits[31:16] = Major Version = 0
-		 * Bits[15:0] = Minor Version = 2
-		 */
-		val = 2;
-		break;
-	case PSCI_0_2_FN_CPU_SUSPEND:
-	case PSCI_0_2_FN64_CPU_SUSPEND:
-		val = kvm_psci_vcpu_suspend(vcpu);
-		break;
-	case PSCI_0_2_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case PSCI_0_2_FN_CPU_ON:
-	case PSCI_0_2_FN64_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	case PSCI_0_2_FN_AFFINITY_INFO:
-	case PSCI_0_2_FN64_AFFINITY_INFO:
-		val = kvm_psci_vcpu_affinity_info(vcpu);
-		break;
-	case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-		/*
-		 * Trusted OS is MP hence does not require migration
-	         * or
-		 * Trusted OS is not present
-		 */
-		val = PSCI_0_2_TOS_MP;
-		break;
-	case PSCI_0_2_FN_SYSTEM_OFF:
-		kvm_psci_system_off(vcpu);
-		/*
-		 * We should'nt be going back to guest VCPU after
-		 * receiving SYSTEM_OFF request.
-		 *
-		 * If user space accidently/deliberately resumes
-		 * guest VCPU after SYSTEM_OFF request then guest
-		 * VCPU should see internal failure from PSCI return
-		 * value. To achieve this, we preload r0 (or x0) with
-		 * PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	case PSCI_0_2_FN_SYSTEM_RESET:
-		kvm_psci_system_reset(vcpu);
-		/*
-		 * Same reason as SYSTEM_OFF for preloading r0 (or x0)
-		 * with PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	vcpu_set_reg(vcpu, 0, val);
-	return ret;
-}
-
-static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
-	unsigned long val;
-
-	switch (psci_fn) {
-	case KVM_PSCI_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case KVM_PSCI_FN_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	vcpu_set_reg(vcpu, 0, val);
-	return 1;
-}
-
-/**
- * kvm_psci_call - handle PSCI call if r0 value is in range
- * @vcpu: Pointer to the VCPU struct
- *
- * Handle PSCI calls from guests through traps from HVC instructions.
- * The calling convention is similar to SMC calls to the secure world
- * where the function number is placed in r0.
- *
- * This function returns: > 0 (success), 0 (success but exit to user
- * space), and < 0 (errors)
- *
- * Errors:
- * -EINVAL: Unrecognized PSCI function
- */
-int kvm_psci_call(struct kvm_vcpu *vcpu)
-{
-	switch (kvm_psci_version(vcpu)) {
-	case KVM_ARM_PSCI_0_2:
-		return kvm_psci_0_2_call(vcpu);
-	case KVM_ARM_PSCI_0_1:
-		return kvm_psci_0_1_call(vcpu);
-	default:
-		return -EINVAL;
-	};
-}
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index c25a88598eb0..fc0943776db2 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -6,133 +6,6 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
 
-/*
- * Tracepoints for entry/exit to guest
- */
-TRACE_EVENT(kvm_entry,
-	TP_PROTO(unsigned long vcpu_pc),
-	TP_ARGS(vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_exit,
-	TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
-	TP_ARGS(idx, exit_reason, vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	int,		idx		)
-		__field(	unsigned int,	exit_reason	)
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->idx			= idx;
-		__entry->exit_reason		= exit_reason;
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
-		  __print_symbolic(__entry->idx, kvm_arm_exception_type),
-		  __entry->exit_reason,
-		  __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
-		  __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_guest_fault,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
-		 unsigned long hxfar,
-		 unsigned long long ipa),
-	TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	hsr		)
-		__field(	unsigned long,	hxfar		)
-		__field(   unsigned long long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->hsr			= hsr;
-		__entry->hxfar			= hxfar;
-		__entry->ipa			= ipa;
-	),
-
-	TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
-		  __entry->ipa, __entry->hsr,
-		  __entry->hxfar, __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_access_fault,
-	TP_PROTO(unsigned long ipa),
-	TP_ARGS(ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->ipa		= ipa;
-	),
-
-	TP_printk("IPA: %lx", __entry->ipa)
-);
-
-TRACE_EVENT(kvm_irq_line,
-	TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
-	TP_ARGS(type, vcpu_idx, irq_num, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	type		)
-		__field(	int,		vcpu_idx	)
-		__field(	int,		irq_num		)
-		__field(	int,		level		)
-	),
-
-	TP_fast_assign(
-		__entry->type		= type;
-		__entry->vcpu_idx	= vcpu_idx;
-		__entry->irq_num	= irq_num;
-		__entry->level		= level;
-	),
-
-	TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
-		  (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
-		  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
-);
-
-TRACE_EVENT(kvm_mmio_emulate,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
-		 unsigned long cpsr),
-	TP_ARGS(vcpu_pc, instr, cpsr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	instr		)
-		__field(	unsigned long,	cpsr		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->instr			= instr;
-		__entry->cpsr			= cpsr;
-	),
-
-	TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
-		  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
-);
-
 /* Architecturally implementation defined CP15 register access */
 TRACE_EVENT(kvm_emulate_cp15_imp,
 	TP_PROTO(unsigned long Op1, unsigned long Rt1, unsigned long CRn,
@@ -181,87 +54,6 @@ TRACE_EVENT(kvm_wfx,
 		__entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
 );
 
-TRACE_EVENT(kvm_unmap_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier unmap hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_unmap_hva_range,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
-);
-
 TRACE_EVENT(kvm_hvc,
 	TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
 	TP_ARGS(vcpu_pc, r0, imm),
@@ -282,45 +74,6 @@ TRACE_EVENT(kvm_hvc,
 		  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
-TRACE_EVENT(kvm_set_way_flush,
-	    TP_PROTO(unsigned long vcpu_pc, bool cache),
-	    TP_ARGS(vcpu_pc, cache),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		cache		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->cache		= cache;
-	    ),
-
-	    TP_printk("S/W flush at 0x%016lx (cache %s)",
-		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
-);
-
-TRACE_EVENT(kvm_toggle_cache,
-	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
-	    TP_ARGS(vcpu_pc, was, now),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		was		)
-		    __field(	bool,		now		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->was		= was;
-		    __entry->now		= now;
-	    ),
-
-	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
-		      __entry->vcpu_pc, __entry->was ? "on" : "off",
-		      __entry->now ? "on" : "off")
-);
-
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f5ea0ba70f07..fe39e6841326 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -240,6 +240,12 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
 	return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE;
 }
 
+static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
+{
+	u32 esr = kvm_vcpu_get_hsr(vcpu);
+	return (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+}
+
 static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
 {
 	return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 869ee480deed..70eea2ecc663 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -216,13 +216,17 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
 #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT	10
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
 			(0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK	0x3ff
 #define VGIC_LEVEL_INFO_LINE_LEVEL	0
 
-#define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
+#define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
+#define   KVM_DEV_ARM_ITS_SAVE_TABLES           1
+#define   KVM_DEV_ARM_ITS_RESTORE_TABLES        2
+#define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
 
 /* Device Control API on vcpu fd */
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index afd51bebb9c5..5d9810086c25 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -7,14 +7,13 @@ CFLAGS_arm.o := -I.
 CFLAGS_mmu.o := -I.
 
 KVM=../../../virt/kvm
-ARM=../../../arch/arm/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index efbe9e8e7a78..0fe27024a2e1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1529,8 +1529,8 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
 {
 	struct sys_reg_params params;
 	u32 hsr = kvm_vcpu_get_hsr(vcpu);
-	int Rt = (hsr >> 5) & 0xf;
-	int Rt2 = (hsr >> 10) & 0xf;
+	int Rt = kvm_vcpu_sys_get_rt(vcpu);
+	int Rt2 = (hsr >> 10) & 0x1f;
 
 	params.is_aarch32 = true;
 	params.is_32bit = false;
@@ -1586,7 +1586,7 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
 {
 	struct sys_reg_params params;
 	u32 hsr = kvm_vcpu_get_hsr(vcpu);
-	int Rt  = (hsr >> 5) & 0xf;
+	int Rt  = kvm_vcpu_sys_get_rt(vcpu);
 
 	params.is_aarch32 = true;
 	params.is_32bit = true;
@@ -1688,7 +1688,7 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	struct sys_reg_params params;
 	unsigned long esr = kvm_vcpu_get_hsr(vcpu);
-	int Rt = (esr >> 5) & 0x1f;
+	int Rt = kvm_vcpu_sys_get_rt(vcpu);
 	int ret;
 
 	trace_kvm_handle_sys_reg(esr);
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 0593d9479f74..b148496ffe36 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -111,6 +111,8 @@ struct kvmppc_host_state {
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	void __iomem *xics_phys;
+	void __iomem *xive_tima_phys;
+	void __iomem *xive_tima_virt;
 	u32 saved_xirr;
 	u64 dabr;
 	u64 host_mmcr[7];	/* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 77c60826d145..9c51ac4b8f36 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -210,6 +210,12 @@ struct kvmppc_spapr_tce_table {
 /* XICS components, defined in book3s_xics.c */
 struct kvmppc_xics;
 struct kvmppc_icp;
+extern struct kvm_device_ops kvm_xics_ops;
+
+/* XIVE components, defined in book3s_xive.c */
+struct kvmppc_xive;
+struct kvmppc_xive_vcpu;
+extern struct kvm_device_ops kvm_xive_ops;
 
 struct kvmppc_passthru_irqmap;
 
@@ -298,6 +304,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_xics *xics;
+	struct kvmppc_xive *xive;
 	struct kvmppc_passthru_irqmap *pimap;
 #endif
 	struct kvmppc_ops *kvm_ops;
@@ -427,7 +434,7 @@ struct kvmppc_passthru_irqmap {
 
 #define KVMPPC_IRQ_DEFAULT	0
 #define KVMPPC_IRQ_MPIC		1
-#define KVMPPC_IRQ_XICS		2
+#define KVMPPC_IRQ_XICS		2 /* Includes a XIVE option */
 
 #define MMIO_HPTE_CACHE_SIZE	4
 
@@ -454,6 +461,21 @@ struct mmio_hpte_cache {
 
 struct openpic;
 
+/* W0 and W1 of a XIVE thread management context */
+union xive_tma_w01 {
+	struct {
+		u8	nsr;
+		u8	cppr;
+		u8	ipb;
+		u8	lsmfb;
+		u8	ack;
+		u8	inc;
+		u8	age;
+		u8	pipr;
+	};
+	__be64 w01;
+};
+
 struct kvm_vcpu_arch {
 	ulong host_stack;
 	u32 host_pid;
@@ -714,6 +736,10 @@ struct kvm_vcpu_arch {
 	struct openpic *mpic;	/* KVM_IRQ_MPIC */
 #ifdef CONFIG_KVM_XICS
 	struct kvmppc_icp *icp; /* XICS presentation controller */
+	struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
+	__be32 xive_cam_word;    /* Cooked W2 in proper endian with valid bit */
+	u32 xive_pushed;	 /* Is the VP pushed on the physical CPU ? */
+	union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 76e940a3c145..e0d88c38602b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -240,6 +240,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+
 extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
 				u32 priority);
 extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
@@ -428,6 +429,14 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 	paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
 }
 
+static inline void kvmppc_set_xive_tima(int cpu,
+					unsigned long phys_addr,
+					void __iomem *virt_addr)
+{
+	paca[cpu].kvm_hstate.xive_tima_phys = (void __iomem *)phys_addr;
+	paca[cpu].kvm_hstate.xive_tima_virt = virt_addr;
+}
+
 static inline u32 kvmppc_get_xics_latch(void)
 {
 	u32 xirr;
@@ -458,6 +467,11 @@ static inline void __init kvm_cma_reserve(void)
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
 
+static inline void kvmppc_set_xive_tima(int cpu,
+					unsigned long phys_addr,
+					void __iomem *virt_addr)
+{}
+
 static inline u32 kvmppc_get_xics_latch(void)
 {
 	return 0;
@@ -508,6 +522,10 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr,
 					struct kvmppc_irq_map *irq_map,
 					struct kvmppc_passthru_irqmap *pimap,
 					bool *again);
+
+extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
+			       int level, bool line_status);
+
 extern int h_ipi_redirect;
 #else
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
@@ -525,6 +543,60 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
 #endif
 
+#ifdef CONFIG_KVM_XIVE
+/*
+ * Below the first "xive" is the "eXternal Interrupt Virtualization Engine"
+ * ie. P9 new interrupt controller, while the second "xive" is the legacy
+ * "eXternal Interrupt Vector Entry" which is the configuration of an
+ * interrupt on the "xics" interrupt controller on P8 and earlier. Those
+ * two function consume or produce a legacy "XIVE" state from the
+ * new "XIVE" interrupt controller.
+ */
+extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
+				u32 priority);
+extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+				u32 *priority);
+extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
+extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
+extern void kvmppc_xive_init_module(void);
+extern void kvmppc_xive_exit_module(void);
+
+extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
+				    struct kvm_vcpu *vcpu, u32 cpu);
+extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
+extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+				  struct irq_desc *host_desc);
+extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+				  struct irq_desc *host_desc);
+extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+
+extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
+			       int level, bool line_status);
+#else
+static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
+				       u32 priority) { return -1; }
+static inline int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+				       u32 *priority) { return -1; }
+static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; }
+static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; }
+static inline void kvmppc_xive_init_module(void) { }
+static inline void kvmppc_xive_exit_module(void) { }
+
+static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
+					   struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
+static inline void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+					 struct irq_desc *host_desc) { return -ENODEV; }
+static inline int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+					 struct irq_desc *host_desc) { return -ENODEV; }
+static inline u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) { return 0; }
+static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { return -ENOENT; }
+
+static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
+				      int level, bool line_status) { return -ENODEV; }
+#endif /* CONFIG_KVM_XIVE */
+
 /*
  * Prototypes for functions called only from assembler code.
  * Having prototypes reduces sparse errors.
@@ -562,6 +634,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
                           unsigned long slb_v, unsigned int status, bool data);
 unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
+unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu);
+unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
 int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                     unsigned long mfrr);
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 3cdbeaeac397..c8a822acf962 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -99,7 +99,6 @@ struct xive_q {
 #define XIVE_ESB_SET_PQ_01	0xd00
 #define XIVE_ESB_SET_PQ_10	0xe00
 #define XIVE_ESB_SET_PQ_11	0xf00
-#define XIVE_ESB_MASK		XIVE_ESB_SET_PQ_01
 
 #define XIVE_ESB_VAL_P		0x2
 #define XIVE_ESB_VAL_Q		0x1
@@ -136,11 +135,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
 				       __be32 *qpage, u32 order, bool can_escalate);
 extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
 
-extern bool __xive_irq_trigger(struct xive_irq_data *xd);
-extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
-extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
-
+extern void xive_native_sync_source(u32 hw_irq);
 extern bool is_xive_irq(struct irq_chip *chip);
+extern int xive_native_enable_vp(u32 vp_id);
+extern int xive_native_disable_vp(u32 vp_id);
+extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
 
 #else
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 439c257dec4a..709e23425317 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -634,6 +634,8 @@ int main(void)
 	HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
 	HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+	HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys);
+	HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt);
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_PTID, ptid);
@@ -719,6 +721,14 @@ int main(void)
 	OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6);
 #endif
 
+#ifdef CONFIG_KVM_XICS
+	DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu,
+					       arch.xive_saved_state));
+	DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
+					    arch.xive_cam_word));
+	DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu);
 	OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl);
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 65a471de96de..24de532c1736 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -197,6 +197,11 @@ config KVM_XICS
 	  Specification) interrupt controller architecture used on
 	  IBM POWER (pSeries) servers.
 
+config KVM_XIVE
+	bool
+	default y
+	depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b87ccde2137a..d91a2604c496 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -74,7 +74,7 @@ kvm-hv-y += \
 	book3s_64_mmu_radix.o
 
 kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
-	book3s_hv_rm_xics.o
+	book3s_hv_rm_xics.o book3s_hv_rm_xive.o
 
 ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
@@ -89,6 +89,8 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
 	book3s_xics.o
 
+kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o
+
 kvm-book3s_64-module-objs := \
 	$(common-objs-y) \
 	book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 8c4d7e9d27d2..72d977e30952 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -35,6 +35,7 @@
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
 #include <asm/page.h>
+#include <asm/xive.h>
 
 #include "book3s.h"
 #include "trace.h"
@@ -596,11 +597,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
 			break;
 #ifdef CONFIG_KVM_XICS
 		case KVM_REG_PPC_ICP_STATE:
-			if (!vcpu->arch.icp) {
+			if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
 				r = -ENXIO;
 				break;
 			}
-			*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
+			if (xive_enabled())
+				*val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
+			else
+				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
 			break;
 #endif /* CONFIG_KVM_XICS */
 		case KVM_REG_PPC_FSCR:
@@ -666,12 +670,14 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
 #endif /* CONFIG_VSX */
 #ifdef CONFIG_KVM_XICS
 		case KVM_REG_PPC_ICP_STATE:
-			if (!vcpu->arch.icp) {
+			if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
 				r = -ENXIO;
 				break;
 			}
-			r = kvmppc_xics_set_icp(vcpu,
-						set_reg_val(id, *val));
+			if (xive_enabled())
+				r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
+			else
+				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
 			break;
 #endif /* CONFIG_KVM_XICS */
 		case KVM_REG_PPC_FSCR:
@@ -942,6 +948,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
 	return kvm->arch.kvm_ops->hcall_implemented(hcall);
 }
 
+#ifdef CONFIG_KVM_XICS
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+		bool line_status)
+{
+	if (xive_enabled())
+		return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
+					   line_status);
+	else
+		return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level,
+					   line_status);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
+			      struct kvm *kvm, int irq_source_id,
+			      int level, bool line_status)
+{
+	return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
+			   level, line_status);
+}
+static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e,
+				 struct kvm *kvm, int irq_source_id, int level,
+				 bool line_status)
+{
+	return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
+}
+
+int kvm_irq_map_gsi(struct kvm *kvm,
+		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
+{
+	entries->gsi = gsi;
+	entries->type = KVM_IRQ_ROUTING_IRQCHIP;
+	entries->set = kvmppc_book3s_set_irq;
+	entries->irqchip.irqchip = 0;
+	entries->irqchip.pin = gsi;
+	return 1;
+}
+
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+	return pin;
+}
+
+#endif /* CONFIG_KVM_XICS */
+
 static int kvmppc_book3s_init(void)
 {
 	int r;
@@ -952,12 +1002,25 @@ static int kvmppc_book3s_init(void)
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	r = kvmppc_book3s_init_pr();
 #endif
-	return r;
 
+#ifdef CONFIG_KVM_XICS
+#ifdef CONFIG_KVM_XIVE
+	if (xive_enabled()) {
+		kvmppc_xive_init_module();
+		kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
+	} else
+#endif
+		kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
+#endif
+	return r;
 }
 
 static void kvmppc_book3s_exit(void)
 {
+#ifdef CONFIG_KVM_XICS
+	if (xive_enabled())
+		kvmppc_xive_exit_module();
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	kvmppc_book3s_exit_pr();
 #endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 549dd6070dee..42b7a4fd57d9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -67,6 +67,7 @@
 #include <asm/mmu.h>
 #include <asm/opal.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 
 #include "book3s.h"
 
@@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 	case H_IPOLL:
 	case H_XIRR_X:
 		if (kvmppc_xics_enabled(vcpu)) {
+			if (xive_enabled()) {
+				ret = H_NOT_AVAILABLE;
+				return RESUME_GUEST;
+			}
 			ret = kvmppc_xics_hcall(vcpu, req);
 			break;
 		}
@@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			r = kvmppc_book3s_hv_page_fault(run, vcpu,
 				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-		} else if (r == RESUME_PASSTHROUGH)
-			r = kvmppc_xics_rm_complete(vcpu, 0);
+		} else if (r == RESUME_PASSTHROUGH) {
+			if (WARN_ON(xive_enabled()))
+				r = H_SUCCESS;
+			else
+				r = kvmppc_xics_rm_complete(vcpu, 0);
+		}
 	} while (is_kvmppc_resume_guest(r));
 
  out:
@@ -3400,10 +3409,20 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/*
 	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
 	 * Set HVICE bit to enable hypervisor virtualization interrupts.
+	 * Set HEIC to prevent OS interrupts to go to hypervisor (should
+	 * be unnecessary but better safe than sorry in case we re-enable
+	 * EE in HV mode with this LPCR still set)
 	 */
 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 		lpcr &= ~LPCR_VPM0;
-		lpcr |= LPCR_HVICE;
+		lpcr |= LPCR_HVICE | LPCR_HEIC;
+
+		/*
+		 * If xive is enabled, we route 0x500 interrupts directly
+		 * to the guest.
+		 */
+		if (xive_enabled())
+			lpcr |= LPCR_LPES;
 	}
 
 	/*
@@ -3533,7 +3552,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	struct kvmppc_irq_map *irq_map;
 	struct kvmppc_passthru_irqmap *pimap;
 	struct irq_chip *chip;
-	int i;
+	int i, rc = 0;
 
 	if (!kvm_irq_bypass)
 		return 1;
@@ -3558,10 +3577,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	/*
 	 * For now, we only support interrupts for which the EOI operation
 	 * is an OPAL call followed by a write to XIRR, since that's
-	 * what our real-mode EOI code does.
+	 * what our real-mode EOI code does, or a XIVE interrupt
 	 */
 	chip = irq_data_get_irq_chip(&desc->irq_data);
-	if (!chip || !is_pnv_opal_msi(chip)) {
+	if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
 		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
 			host_irq, guest_gsi);
 		mutex_unlock(&kvm->lock);
@@ -3603,7 +3622,12 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	if (i == pimap->n_mapped)
 		pimap->n_mapped++;
 
-	kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+	if (xive_enabled())
+		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
+	else
+		kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+	if (rc)
+		irq_map->r_hwirq = 0;
 
 	mutex_unlock(&kvm->lock);
 
@@ -3614,7 +3638,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 {
 	struct irq_desc *desc;
 	struct kvmppc_passthru_irqmap *pimap;
-	int i;
+	int i, rc = 0;
 
 	if (!kvm_irq_bypass)
 		return 0;
@@ -3639,9 +3663,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 		return -ENODEV;
 	}
 
-	kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+	if (xive_enabled())
+		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
+	else
+		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
 
-	/* invalidate the entry */
+	/* invalidate the entry (what do do on error from the above ?) */
 	pimap->mapped[i].r_hwirq = 0;
 
 	/*
@@ -3650,7 +3677,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	 */
  unlock:
 	mutex_unlock(&kvm->lock);
-	return 0;
+	return rc;
 }
 
 static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
@@ -3928,7 +3955,7 @@ static int kvmppc_book3s_init_hv(void)
 	 * indirectly, via OPAL.
 	 */
 #ifdef CONFIG_SMP
-	if (!get_paca()->kvm_hstate.xics_phys) {
+	if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
 		struct device_node *np;
 
 		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 9c71c72e65ce..88a65923c649 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -32,6 +32,24 @@
 
 #define KVM_CMA_CHUNK_ORDER	18
 
+#include "book3s_xics.h"
+#include "book3s_xive.h"
+
+/*
+ * The XIVE module will populate these when it loads
+ */
+unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
+unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
+int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
+		       unsigned long mfrr);
+int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
+int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
+EXPORT_SYMBOL_GPL(__xive_vm_h_xirr);
+EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll);
+EXPORT_SYMBOL_GPL(__xive_vm_h_ipi);
+EXPORT_SYMBOL_GPL(__xive_vm_h_cppr);
+EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
+
 /*
  * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
  * should be power of 2.
@@ -211,6 +229,7 @@ void kvmhv_rm_send_ipi(int cpu)
 		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
 		return;
 	}
+
 	/* On POWER8 for IPIs to threads in the same core, use msgsnd. */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
 	    cpu_first_thread_sibling(cpu) ==
@@ -407,6 +426,9 @@ static long kvmppc_read_one_intr(bool *again)
 	u8 host_ipi;
 	int64_t rc;
 
+	if (xive_enabled())
+		return 1;
+
 	/* see if a host IPI is pending */
 	host_ipi = local_paca->kvm_hstate.host_ipi;
 	if (host_ipi)
@@ -491,3 +513,84 @@ static long kvmppc_read_one_intr(bool *again)
 
 	return kvmppc_check_passthru(xisr, xirr, again);
 }
+
+#ifdef CONFIG_KVM_XICS
+static inline bool is_rm(void)
+{
+	return !(mfmsr() & MSR_DR);
+}
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_xirr(vcpu);
+		if (unlikely(!__xive_vm_h_xirr))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_xirr(vcpu);
+	} else
+		return xics_rm_h_xirr(vcpu);
+}
+
+unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.gpr[5] = get_tb();
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_xirr(vcpu);
+		if (unlikely(!__xive_vm_h_xirr))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_xirr(vcpu);
+	} else
+		return xics_rm_h_xirr(vcpu);
+}
+
+unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
+{
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_ipoll(vcpu, server);
+		if (unlikely(!__xive_vm_h_ipoll))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_ipoll(vcpu, server);
+	} else
+		return H_TOO_HARD;
+}
+
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+		    unsigned long mfrr)
+{
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_ipi(vcpu, server, mfrr);
+		if (unlikely(!__xive_vm_h_ipi))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_ipi(vcpu, server, mfrr);
+	} else
+		return xics_rm_h_ipi(vcpu, server, mfrr);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_cppr(vcpu, cppr);
+		if (unlikely(!__xive_vm_h_cppr))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_cppr(vcpu, cppr);
+	} else
+		return xics_rm_h_cppr(vcpu, cppr);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	if (xive_enabled()) {
+		if (is_rm())
+			return xive_rm_h_eoi(vcpu, xirr);
+		if (unlikely(!__xive_vm_h_eoi))
+			return H_NOT_AVAILABLE;
+		return __xive_vm_h_eoi(vcpu, xirr);
+	} else
+		return xics_rm_h_eoi(vcpu, xirr);
+}
+#endif /* CONFIG_KVM_XICS */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index ffde4507ddfd..2a862618f072 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -484,7 +484,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 }
 
 
-unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu)
 {
 	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
@@ -522,8 +522,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
 	return check_too_hard(xics, icp);
 }
 
-int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
-		    unsigned long mfrr)
+int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+		  unsigned long mfrr)
 {
 	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
@@ -609,7 +609,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 	return check_too_hard(xics, this_icp);
 }
 
-int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 {
 	union kvmppc_icp_state old_state, new_state;
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
@@ -729,7 +729,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
 	return check_too_hard(xics, icp);
 }
 
-int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
new file mode 100644
index 000000000000..abf5f01b6eb1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c
@@ -0,0 +1,47 @@
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/cputhreads.h>
+#include <asm/pgtable.h>
+#include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+#include <asm/asm-prototypes.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+
+#include "book3s_xive.h"
+
+/* XXX */
+#include <asm/udbg.h>
+//#define DBG(fmt...) udbg_printf(fmt)
+#define DBG(fmt...) do { } while(0)
+
+static inline void __iomem *get_tima_phys(void)
+{
+	return local_paca->kvm_hstate.xive_tima_phys;
+}
+
+#undef XIVE_RUNTIME_CHECKS
+#define X_PFX xive_rm_
+#define X_STATIC
+#define X_STAT_PFX stat_rm_
+#define __x_tima		get_tima_phys()
+#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_page))
+#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_page))
+#define __x_readb	__raw_rm_readb
+#define __x_writeb	__raw_rm_writeb
+#define __x_readw	__raw_rm_readw
+#define __x_readq	__raw_rm_readq
+#define __x_writeq	__raw_rm_writeq
+
+#include "book3s_xive_template.c"
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7c6477d1840a..bdb3f76ceb6b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -30,6 +30,7 @@
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/tm.h>
 #include <asm/opal.h>
+#include <asm/xive-regs.h>
 
 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
 
@@ -970,6 +971,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	cmpwi	r3, 512		/* 1 microsecond */
 	blt	hdec_soon
 
+#ifdef CONFIG_KVM_XICS
+	/* We are entering the guest on that thread, push VCPU to XIVE */
+	ld	r10, HSTATE_XIVE_TIMA_PHYS(r13)
+	cmpldi	cr0, r10, r0
+	beq	no_xive
+	ld	r11, VCPU_XIVE_SAVED_STATE(r4)
+	li	r9, TM_QW1_OS
+	stdcix	r11,r9,r10
+	eieio
+	lwz	r11, VCPU_XIVE_CAM_WORD(r4)
+	li	r9, TM_QW1_OS + TM_WORD2
+	stwcix	r11,r9,r10
+	li	r9, 1
+	stw	r9, VCPU_XIVE_PUSHED(r4)
+no_xive:
+#endif /* CONFIG_KVM_XICS */
+
 deliver_guest_interrupt:
 	ld	r6, VCPU_CTR(r4)
 	ld	r7, VCPU_XER(r4)
@@ -1307,6 +1325,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	blt	deliver_guest_interrupt
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+#ifdef CONFIG_KVM_XICS
+	/* We are exiting, pull the VP from the XIVE */
+	lwz	r0, VCPU_XIVE_PUSHED(r9)
+	cmpwi	cr0, r0, 0
+	beq	1f
+	li	r7, TM_SPC_PULL_OS_CTX
+	li	r6, TM_QW1_OS
+	mfmsr	r0
+	andi.	r0, r0, MSR_IR		/* in real mode? */
+	beq	2f
+	ld	r10, HSTATE_XIVE_TIMA_VIRT(r13)
+	cmpldi	cr0, r10, 0
+	beq	1f
+	/* First load to pull the context, we ignore the value */
+	lwzx	r11, r7, r10
+	eieio
+	/* Second load to recover the context state (Words 0 and 1) */
+	ldx	r11, r6, r10
+	b	3f
+2:	ld	r10, HSTATE_XIVE_TIMA_PHYS(r13)
+	cmpldi	cr0, r10, 0
+	beq	1f
+	/* First load to pull the context, we ignore the value */
+	lwzcix	r11, r7, r10
+	eieio
+	/* Second load to recover the context state (Words 0 and 1) */
+	ldcix	r11, r6, r10
+3:	std	r11, VCPU_XIVE_SAVED_STATE(r9)
+	/* Fixup some of the state for the next load */
+	li	r10, 0
+	li	r0, 0xff
+	stw	r10, VCPU_XIVE_PUSHED(r9)
+	stb	r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
+	stb	r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
+1:
+#endif /* CONFIG_KVM_XICS */
 	/* Save more register state  */
 	mfdar	r6
 	mfdsisr	r7
@@ -2011,7 +2065,7 @@ hcall_real_table:
 	.long	DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table
 	.long	DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table
 	.long	DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table
-	.long	0		/* 0x70 - H_IPOLL */
+	.long	DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table
 	.long	DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table
 #else
 	.long	0		/* 0x64 - H_EOI */
@@ -2181,7 +2235,11 @@ hcall_real_table:
 	.long	0		/* 0x2f0 */
 	.long	0		/* 0x2f4 */
 	.long	0		/* 0x2f8 */
-	.long	0		/* 0x2fc */
+#ifdef CONFIG_KVM_XICS
+	.long	DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table
+#else
+	.long	0		/* 0x2fc - H_XIRR_X*/
+#endif
 	.long	DOTSYM(kvmppc_h_random) - hcall_real_table
 	.globl	hcall_real_table_end
 hcall_real_table_end:
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 20528701835b..2d3b2b1cc272 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -16,6 +16,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/rtas.h>
+#include <asm/xive.h>
 
 #ifdef CONFIG_KVM_XICS
 static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
@@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
 	server = be32_to_cpu(args->args[1]);
 	priority = be32_to_cpu(args->args[2]);
 
-	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (xive_enabled())
+		rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
+	else
+		rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
 	if (rc)
 		rc = -3;
 out:
@@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
 	irq = be32_to_cpu(args->args[0]);
 
 	server = priority = 0;
-	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (xive_enabled())
+		rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
+	else
+		rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
 	if (rc) {
 		rc = -3;
 		goto out;
@@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
 
 	irq = be32_to_cpu(args->args[0]);
 
-	rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+	if (xive_enabled())
+		rc = kvmppc_xive_int_off(vcpu->kvm, irq);
+	else
+		rc = kvmppc_xics_int_off(vcpu->kvm, irq);
 	if (rc)
 		rc = -3;
 out:
@@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
 
 	irq = be32_to_cpu(args->args[0]);
 
-	rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+	if (xive_enabled())
+		rc = kvmppc_xive_int_on(vcpu->kvm, irq);
+	else
+		rc = kvmppc_xics_int_on(vcpu->kvm, irq);
 	if (rc)
 		rc = -3;
 out:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 459b72cb617a..d329b2add7e2 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1306,8 +1306,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	return 0;
 }
 
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
-		bool line_status)
+int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+			bool line_status)
 {
 	struct kvmppc_xics *xics = kvm->arch.xics;
 
@@ -1316,14 +1316,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 	return ics_deliver_irq(xics, irq, level);
 }
 
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
-			      struct kvm *kvm, int irq_source_id,
-			      int level, bool line_status)
-{
-	return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
-			   level, line_status);
-}
-
 static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	struct kvmppc_xics *xics = dev->private;
@@ -1457,29 +1449,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
 	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
 }
 
-static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e,
-			struct kvm *kvm, int irq_source_id, int level,
-			bool line_status)
-{
-	return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
-		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
-{
-	entries->gsi = gsi;
-	entries->type = KVM_IRQ_ROUTING_IRQCHIP;
-	entries->set = xics_set_irq;
-	entries->irqchip.irqchip = 0;
-	entries->irqchip.pin = gsi;
-	return 1;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-	return pin;
-}
-
 void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
 			    unsigned long host_irq)
 {
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index ec5474cf70c6..453c9e518c19 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -10,6 +10,7 @@
 #ifndef _KVM_PPC_BOOK3S_XICS_H
 #define _KVM_PPC_BOOK3S_XICS_H
 
+#ifdef CONFIG_KVM_XICS
 /*
  * We use a two-level tree to store interrupt source information.
  * There are up to 1024 ICS nodes, each of which can represent
@@ -144,5 +145,11 @@ static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
 	return ics;
 }
 
+extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu);
+extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			 unsigned long mfrr);
+extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
 
+#endif /* CONFIG_KVM_XICS */
 #endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
new file mode 100644
index 000000000000..ffe1da95033a
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -0,0 +1,1894 @@
+/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "xive-kvm: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/debug.h>
+#include <asm/debugfs.h>
+#include <asm/time.h>
+#include <asm/opal.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xive.h"
+
+
+/*
+ * Virtual mode variants of the hcalls for use on radix/radix
+ * with AIL. They require the VCPU's VP to be "pushed"
+ *
+ * We still instanciate them here because we use some of the
+ * generated utility functions as well in this file.
+ */
+#define XIVE_RUNTIME_CHECKS
+#define X_PFX xive_vm_
+#define X_STATIC static
+#define X_STAT_PFX stat_vm_
+#define __x_tima		xive_tima
+#define __x_eoi_page(xd)	((void __iomem *)((xd)->eoi_mmio))
+#define __x_trig_page(xd)	((void __iomem *)((xd)->trig_mmio))
+#define __x_readb	__raw_readb
+#define __x_writeb	__raw_writeb
+#define __x_readw	__raw_readw
+#define __x_readq	__raw_readq
+#define __x_writeq	__raw_writeq
+
+#include "book3s_xive_template.c"
+
+/*
+ * We leave a gap of a couple of interrupts in the queue to
+ * account for the IPI and additional safety guard.
+ */
+#define XIVE_Q_GAP	2
+
+/*
+ * This is a simple trigger for a generic XIVE IRQ. This must
+ * only be called for interrupts that support a trigger page
+ */
+static bool xive_irq_trigger(struct xive_irq_data *xd)
+{
+	/* This should be only for MSIs */
+	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
+		return false;
+
+	/* Those interrupts should always have a trigger page */
+	if (WARN_ON(!xd->trig_mmio))
+		return false;
+
+	out_be64(xd->trig_mmio, 0);
+
+	return true;
+}
+
+static irqreturn_t xive_esc_irq(int irq, void *data)
+{
+	struct kvm_vcpu *vcpu = data;
+
+	/* We use the existing H_PROD mechanism to wake up the target */
+	vcpu->arch.prodded = 1;
+	smp_mb();
+	if (vcpu->arch.ceded)
+		kvmppc_fast_vcpu_kick(vcpu);
+
+	return IRQ_HANDLED;
+}
+
+static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_q *q = &xc->queues[prio];
+	char *name = NULL;
+	int rc;
+
+	/* Already there ? */
+	if (xc->esc_virq[prio])
+		return 0;
+
+	/* Hook up the escalation interrupt */
+	xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
+	if (!xc->esc_virq[prio]) {
+		pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		return -EIO;
+	}
+
+	/*
+	 * Future improvement: start with them disabled
+	 * and handle DD2 and later scheme of merged escalation
+	 * interrupts
+	 */
+	name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
+			 vcpu->kvm->arch.lpid, xc->server_num, prio);
+	if (!name) {
+		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		rc = -ENOMEM;
+		goto error;
+	}
+	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
+			 IRQF_NO_THREAD, name, vcpu);
+	if (rc) {
+		pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
+		       prio, xc->server_num);
+		goto error;
+	}
+	xc->esc_virq_names[prio] = name;
+	return 0;
+error:
+	irq_dispose_mapping(xc->esc_virq[prio]);
+	xc->esc_virq[prio] = 0;
+	kfree(name);
+	return rc;
+}
+
+static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = xc->xive;
+	struct xive_q *q =  &xc->queues[prio];
+	void *qpage;
+	int rc;
+
+	if (WARN_ON(q->qpage))
+		return 0;
+
+	/* Allocate the queue and retrieve infos on current node for now */
+	qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order);
+	if (!qpage) {
+		pr_err("Failed to allocate queue %d for VCPU %d\n",
+		       prio, xc->server_num);
+		return -ENOMEM;;
+	}
+	memset(qpage, 0, 1 << xive->q_order);
+
+	/*
+	 * Reconfigure the queue. This will set q->qpage only once the
+	 * queue is fully configured. This is a requirement for prio 0
+	 * as we will stop doing EOIs for every IPI as soon as we observe
+	 * qpage being non-NULL, and instead will only EOI when we receive
+	 * corresponding queue 0 entries
+	 */
+	rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
+					 xive->q_order, true);
+	if (rc)
+		pr_err("Failed to configure queue %d for VCPU %d\n",
+		       prio, xc->server_num);
+	return rc;
+}
+
+/* Called with kvm_lock held */
+static int xive_check_provisioning(struct kvm *kvm, u8 prio)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvm_vcpu *vcpu;
+	int i, rc;
+
+	lockdep_assert_held(&kvm->lock);
+
+	/* Already provisioned ? */
+	if (xive->qmap & (1 << prio))
+		return 0;
+
+	pr_devel("Provisioning prio... %d\n", prio);
+
+	/* Provision each VCPU and enable escalations */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!vcpu->arch.xive_vcpu)
+			continue;
+		rc = xive_provision_queue(vcpu, prio);
+		if (rc == 0)
+			xive_attach_escalation(vcpu, prio);
+		if (rc)
+			return rc;
+	}
+
+	/* Order previous stores and mark it as provisioned */
+	mb();
+	xive->qmap |= (1 << prio);
+	return 0;
+}
+
+static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvmppc_xive_vcpu *xc;
+	struct xive_q *q;
+
+	/* Locate target server */
+	vcpu = kvmppc_xive_find_server(kvm, server);
+	if (!vcpu) {
+		pr_warn("%s: Can't find server %d\n", __func__, server);
+		return;
+	}
+	xc = vcpu->arch.xive_vcpu;
+	if (WARN_ON(!xc))
+		return;
+
+	q = &xc->queues[prio];
+	atomic_inc(&q->pending_count);
+}
+
+static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_q *q;
+	u32 max;
+
+	if (WARN_ON(!xc))
+		return -ENXIO;
+	if (!xc->valid)
+		return -ENXIO;
+
+	q = &xc->queues[prio];
+	if (WARN_ON(!q->qpage))
+		return -ENXIO;
+
+	/* Calculate max number of interrupts in that queue. */
+	max = (q->msk + 1) - XIVE_Q_GAP;
+	return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
+}
+
+static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
+{
+	struct kvm_vcpu *vcpu;
+	int i, rc;
+
+	/* Locate target server */
+	vcpu = kvmppc_xive_find_server(kvm, *server);
+	if (!vcpu) {
+		pr_devel("Can't find server %d\n", *server);
+		return -EINVAL;
+	}
+
+	pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio);
+
+	/* Try pick it */
+	rc = xive_try_pick_queue(vcpu, prio);
+	if (rc == 0)
+		return rc;
+
+	pr_devel(" .. failed, looking up candidate...\n");
+
+	/* Failed, pick another VCPU */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!vcpu->arch.xive_vcpu)
+			continue;
+		rc = xive_try_pick_queue(vcpu, prio);
+		if (rc == 0) {
+			*server = vcpu->arch.xive_vcpu->server_num;
+			pr_devel("  found on 0x%x/%d\n", *server, prio);
+			return rc;
+		}
+	}
+	pr_devel("  no available target !\n");
+
+	/* No available target ! */
+	return -EBUSY;
+}
+
+static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
+			     struct kvmppc_xive_src_block *sb,
+			     struct kvmppc_xive_irq_state *state)
+{
+	struct xive_irq_data *xd;
+	u32 hw_num;
+	u8 old_prio;
+	u64 val;
+
+	/*
+	 * Take the lock, set masked, try again if racing
+	 * with H_EOI
+	 */
+	for (;;) {
+		arch_spin_lock(&sb->lock);
+		old_prio = state->guest_priority;
+		state->guest_priority = MASKED;
+		mb();
+		if (!state->in_eoi)
+			break;
+		state->guest_priority = old_prio;
+		arch_spin_unlock(&sb->lock);
+	}
+
+	/* No change ? Bail */
+	if (old_prio == MASKED)
+		return old_prio;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	/*
+	 * If the interrupt is marked as needing masking via
+	 * firmware, we do it here. Firmware masking however
+	 * is "lossy", it won't return the old p and q bits
+	 * and won't set the interrupt to a state where it will
+	 * record queued ones. If this is an issue we should do
+	 * lazy masking instead.
+	 *
+	 * For now, we work around this in unmask by forcing
+	 * an interrupt whenever we unmask a non-LSI via FW
+	 * (if ever).
+	 */
+	if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
+		xive_native_configure_irq(hw_num,
+					  xive->vp_base + state->act_server,
+					  MASKED, state->number);
+		/* set old_p so we can track if an H_EOI was done */
+		state->old_p = true;
+		state->old_q = false;
+	} else {
+		/* Set PQ to 10, return old P and old Q and remember them */
+		val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
+		state->old_p = !!(val & 2);
+		state->old_q = !!(val & 1);
+
+		/*
+		 * Synchronize hardware to sensure the queues are updated
+		 * when masking
+		 */
+		xive_native_sync_source(hw_num);
+	}
+
+	return old_prio;
+}
+
+static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
+				 struct kvmppc_xive_irq_state *state)
+{
+	/*
+	 * Take the lock try again if racing with H_EOI
+	 */
+	for (;;) {
+		arch_spin_lock(&sb->lock);
+		if (!state->in_eoi)
+			break;
+		arch_spin_unlock(&sb->lock);
+	}
+}
+
+static void xive_finish_unmask(struct kvmppc_xive *xive,
+			       struct kvmppc_xive_src_block *sb,
+			       struct kvmppc_xive_irq_state *state,
+			       u8 prio)
+{
+	struct xive_irq_data *xd;
+	u32 hw_num;
+
+	/* If we aren't changing a thing, move on */
+	if (state->guest_priority != MASKED)
+		goto bail;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	/*
+	 * See command in xive_lock_and_mask() concerning masking
+	 * via firmware.
+	 */
+	if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
+		xive_native_configure_irq(hw_num,
+					  xive->vp_base + state->act_server,
+					  state->act_priority, state->number);
+		/* If an EOI is needed, do it here */
+		if (!state->old_p)
+			xive_vm_source_eoi(hw_num, xd);
+		/* If this is not an LSI, force a trigger */
+		if (!(xd->flags & OPAL_XIVE_IRQ_LSI))
+			xive_irq_trigger(xd);
+		goto bail;
+	}
+
+	/* Old Q set, set PQ to 11 */
+	if (state->old_q)
+		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
+
+	/*
+	 * If not old P, then perform an "effective" EOI,
+	 * on the source. This will handle the cases where
+	 * FW EOI is needed.
+	 */
+	if (!state->old_p)
+		xive_vm_source_eoi(hw_num, xd);
+
+	/* Synchronize ordering and mark unmasked */
+	mb();
+bail:
+	state->guest_priority = prio;
+}
+
+/*
+ * Target an interrupt to a given server/prio, this will fallback
+ * to another server if necessary and perform the HW targetting
+ * updates as needed
+ *
+ * NOTE: Must be called with the state lock held
+ */
+static int xive_target_interrupt(struct kvm *kvm,
+				 struct kvmppc_xive_irq_state *state,
+				 u32 server, u8 prio)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	u32 hw_num;
+	int rc;
+
+	/*
+	 * This will return a tentative server and actual
+	 * priority. The count for that new target will have
+	 * already been incremented.
+	 */
+	rc = xive_select_target(kvm, &server, prio);
+
+	/*
+	 * We failed to find a target ? Not much we can do
+	 * at least until we support the GIQ.
+	 */
+	if (rc)
+		return rc;
+
+	/*
+	 * Increment the old queue pending count if there
+	 * was one so that the old queue count gets adjusted later
+	 * when observed to be empty.
+	 */
+	if (state->act_priority != MASKED)
+		xive_inc_q_pending(kvm,
+				   state->act_server,
+				   state->act_priority);
+	/*
+	 * Update state and HW
+	 */
+	state->act_priority = prio;
+	state->act_server = server;
+
+	/* Get the right irq */
+	kvmppc_xive_select_irq(state, &hw_num, NULL);
+
+	return xive_native_configure_irq(hw_num,
+					 xive->vp_base + server,
+					 prio, state->number);
+}
+
+/*
+ * Targetting rules: In order to avoid losing track of
+ * pending interrupts accross mask and unmask, which would
+ * allow queue overflows, we implement the following rules:
+ *
+ *  - Unless it was never enabled (or we run out of capacity)
+ *    an interrupt is always targetted at a valid server/queue
+ *    pair even when "masked" by the guest. This pair tends to
+ *    be the last one used but it can be changed under some
+ *    circumstances. That allows us to separate targetting
+ *    from masking, we only handle accounting during (re)targetting,
+ *    this also allows us to let an interrupt drain into its target
+ *    queue after masking, avoiding complex schemes to remove
+ *    interrupts out of remote processor queues.
+ *
+ *  - When masking, we set PQ to 10 and save the previous value
+ *    of P and Q.
+ *
+ *  - When unmasking, if saved Q was set, we set PQ to 11
+ *    otherwise we leave PQ to the HW state which will be either
+ *    10 if nothing happened or 11 if the interrupt fired while
+ *    masked. Effectively we are OR'ing the previous Q into the
+ *    HW Q.
+ *
+ *    Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
+ *    which will unmask the interrupt and shoot a new one if Q was
+ *    set.
+ *
+ *    Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
+ *    effectively meaning an H_EOI from the guest is still expected
+ *    for that interrupt).
+ *
+ *  - If H_EOI occurs while masked, we clear the saved P.
+ *
+ *  - When changing target, we account on the new target and
+ *    increment a separate "pending" counter on the old one.
+ *    This pending counter will be used to decrement the old
+ *    target's count when its queue has been observed empty.
+ */
+
+int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
+			 u32 priority)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u8 new_act_prio;
+	int rc = 0;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n",
+		 irq, server, priority);
+
+	/* First, check provisioning of queues */
+	if (priority != MASKED)
+		rc = xive_check_provisioning(xive->kvm,
+			      xive_prio_from_guest(priority));
+	if (rc) {
+		pr_devel("  provisioning failure %d !\n", rc);
+		return rc;
+	}
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * We first handle masking/unmasking since the locking
+	 * might need to be retried due to EOIs, we'll handle
+	 * targetting changes later. These functions will return
+	 * with the SB lock held.
+	 *
+	 * xive_lock_and_mask() will also set state->guest_priority
+	 * but won't otherwise change other fields of the state.
+	 *
+	 * xive_lock_for_unmask will not actually unmask, this will
+	 * be done later by xive_finish_unmask() once the targetting
+	 * has been done, so we don't try to unmask an interrupt
+	 * that hasn't yet been targetted.
+	 */
+	if (priority == MASKED)
+		xive_lock_and_mask(xive, sb, state);
+	else
+		xive_lock_for_unmask(sb, state);
+
+
+	/*
+	 * Then we handle targetting.
+	 *
+	 * First calculate a new "actual priority"
+	 */
+	new_act_prio = state->act_priority;
+	if (priority != MASKED)
+		new_act_prio = xive_prio_from_guest(priority);
+
+	pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n",
+		 new_act_prio, state->act_server, state->act_priority);
+
+	/*
+	 * Then check if we actually need to change anything,
+	 *
+	 * The condition for re-targetting the interrupt is that
+	 * we have a valid new priority (new_act_prio is not 0xff)
+	 * and either the server or the priority changed.
+	 *
+	 * Note: If act_priority was ff and the new priority is
+	 *       also ff, we don't do anything and leave the interrupt
+	 *       untargetted. An attempt of doing an int_on on an
+	 *       untargetted interrupt will fail. If that is a problem
+	 *       we could initialize interrupts with valid default
+	 */
+
+	if (new_act_prio != MASKED &&
+	    (state->act_server != server ||
+	     state->act_priority != new_act_prio))
+		rc = xive_target_interrupt(kvm, state, server, new_act_prio);
+
+	/*
+	 * Perform the final unmasking of the interrupt source
+	 * if necessary
+	 */
+	if (priority != MASKED)
+		xive_finish_unmask(xive, sb, state, priority);
+
+	/*
+	 * Finally Update saved_priority to match. Only int_on/off
+	 * set this field to a different value.
+	 */
+	state->saved_priority = priority;
+
+	arch_spin_unlock(&sb->lock);
+	return rc;
+}
+
+int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+			 u32 *priority)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+	arch_spin_lock(&sb->lock);
+	*server = state->guest_server;
+	*priority = state->guest_priority;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	pr_devel("int_on(irq=0x%x)\n", irq);
+
+	/*
+	 * Check if interrupt was not targetted
+	 */
+	if (state->act_priority == MASKED) {
+		pr_devel("int_on on untargetted interrupt\n");
+		return -EINVAL;
+	}
+
+	/* If saved_priority is 0xff, do nothing */
+	if (state->saved_priority == MASKED)
+		return 0;
+
+	/*
+	 * Lock and unmask it.
+	 */
+	xive_lock_for_unmask(sb, state);
+	xive_finish_unmask(xive, sb, state, state->saved_priority);
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	pr_devel("int_off(irq=0x%x)\n", irq);
+
+	/*
+	 * Lock and mask
+	 */
+	state->saved_priority = xive_lock_and_mask(xive, sb, state);
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+
+static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return false;
+	state = &sb->irq_state[idx];
+	if (!state->valid)
+		return false;
+
+	/*
+	 * Trigger the IPI. This assumes we never restore a pass-through
+	 * interrupt which should be safe enough
+	 */
+	xive_irq_trigger(&state->ipi_data);
+
+	return true;
+}
+
+u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+	if (!xc)
+		return 0;
+
+	/* Return the per-cpu state for state saving/migration */
+	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
+	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+}
+
+int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	u8 cppr, mfrr;
+	u32 xisr;
+
+	if (!xc || !xive)
+		return -ENOENT;
+
+	/* Grab individual state fields. We don't use pending_pri */
+	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+		KVM_REG_PPC_ICP_XISR_MASK;
+	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+
+	pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
+		 xc->server_num, cppr, mfrr, xisr);
+
+	/*
+	 * We can't update the state of a "pushed" VCPU, but that
+	 * shouldn't happen.
+	 */
+	if (WARN_ON(vcpu->arch.xive_pushed))
+		return -EIO;
+
+	/* Update VCPU HW saved state */
+	vcpu->arch.xive_saved_state.cppr = cppr;
+	xc->hw_cppr = xc->cppr = cppr;
+
+	/*
+	 * Update MFRR state. If it's not 0xff, we mark the VCPU as
+	 * having a pending MFRR change, which will re-evaluate the
+	 * target. The VCPU will thus potentially get a spurious
+	 * interrupt but that's not a big deal.
+	 */
+	xc->mfrr = mfrr;
+	if (mfrr < cppr)
+		xive_irq_trigger(&xc->vp_ipi_data);
+
+	/*
+	 * Now saved XIRR is "interesting". It means there's something in
+	 * the legacy "1 element" queue... for an IPI we simply ignore it,
+	 * as the MFRR restore will handle that. For anything else we need
+	 * to force a resend of the source.
+	 * However the source may not have been setup yet. If that's the
+	 * case, we keep that info and increment a counter in the xive to
+	 * tell subsequent xive_set_source() to go look.
+	 */
+	if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
+		xc->delayed_irq = xisr;
+		xive->delayed_irqs++;
+		pr_devel("  xisr restore delayed\n");
+	}
+
+	return 0;
+}
+
+int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+			   struct irq_desc *host_desc)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
+	unsigned int host_irq = irq_desc_get_irq(host_desc);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
+	u16 idx;
+	u8 prio;
+	int rc;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq);
+
+	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * Mark the passed-through interrupt as going to a VCPU,
+	 * this will prevent further EOIs and similar operations
+	 * from the XIVE code. It will also mask the interrupt
+	 * to either PQ=10 or 11 state, the latter if the interrupt
+	 * is pending. This will allow us to unmask or retrigger it
+	 * after routing it to the guest with a simple EOI.
+	 *
+	 * The "state" argument is a "token", all it needs is to be
+	 * non-NULL to switch to passed-through or NULL for the
+	 * other way around. We may not yet have an actual VCPU
+	 * target here and we don't really care.
+	 */
+	rc = irq_set_vcpu_affinity(host_irq, state);
+	if (rc) {
+		pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
+		return rc;
+	}
+
+	/*
+	 * Mask and read state of IPI. We need to know if its P bit
+	 * is set as that means it's potentially already using a
+	 * queue entry in the target
+	 */
+	prio = xive_lock_and_mask(xive, sb, state);
+	pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
+		 state->old_p, state->old_q);
+
+	/* Turn the IPI hard off */
+	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+
+	/* Grab info about irq */
+	state->pt_number = hw_irq;
+	state->pt_data = irq_data_get_irq_handler_data(host_data);
+
+	/*
+	 * Configure the IRQ to match the existing configuration of
+	 * the IPI if it was already targetted. Otherwise this will
+	 * mask the interrupt in a lossy way (act_priority is 0xff)
+	 * which is fine for a never started interrupt.
+	 */
+	xive_native_configure_irq(hw_irq,
+				  xive->vp_base + state->act_server,
+				  state->act_priority, state->number);
+
+	/*
+	 * We do an EOI to enable the interrupt (and retrigger if needed)
+	 * if the guest has the interrupt unmasked and the P bit was *not*
+	 * set in the IPI. If it was set, we know a slot may still be in
+	 * use in the target queue thus we have to wait for a guest
+	 * originated EOI
+	 */
+	if (prio != MASKED && !state->old_p)
+		xive_vm_source_eoi(hw_irq, state->pt_data);
+
+	/* Clear old_p/old_q as they are no longer relevant */
+	state->old_p = state->old_q = false;
+
+	/* Restore guest prio (unlocks EOI) */
+	mb();
+	state->guest_priority = prio;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
+
+int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+			   struct irq_desc *host_desc)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	unsigned int host_irq = irq_desc_get_irq(host_desc);
+	u16 idx;
+	u8 prio;
+	int rc;
+
+	if (!xive)
+		return -ENODEV;
+
+	pr_devel("clr_mapped girq 0x%lx...\n", guest_irq);
+
+	sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
+	if (!sb)
+		return -EINVAL;
+	state = &sb->irq_state[idx];
+
+	/*
+	 * Mask and read state of IRQ. We need to know if its P bit
+	 * is set as that means it's potentially already using a
+	 * queue entry in the target
+	 */
+	prio = xive_lock_and_mask(xive, sb, state);
+	pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
+		 state->old_p, state->old_q);
+
+	/*
+	 * If old_p is set, the interrupt is pending, we switch it to
+	 * PQ=11. This will force a resend in the host so the interrupt
+	 * isn't lost to whatver host driver may pick it up
+	 */
+	if (state->old_p)
+		xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
+
+	/* Release the passed-through interrupt to the host */
+	rc = irq_set_vcpu_affinity(host_irq, NULL);
+	if (rc) {
+		pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq);
+		return rc;
+	}
+
+	/* Forget about the IRQ */
+	state->pt_number = 0;
+	state->pt_data = NULL;
+
+	/* Reconfigure the IPI */
+	xive_native_configure_irq(state->ipi_number,
+				  xive->vp_base + state->act_server,
+				  state->act_priority, state->number);
+
+	/*
+	 * If old_p is set (we have a queue entry potentially
+	 * occupied) or the interrupt is masked, we set the IPI
+	 * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
+	 */
+	if (prio == MASKED || state->old_p)
+		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
+	else
+		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
+
+	/* Restore guest prio (unlocks EOI) */
+	mb();
+	state->guest_priority = prio;
+	arch_spin_unlock(&sb->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
+
+static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	int i, j;
+
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+		if (!sb)
+			continue;
+		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
+			struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
+
+			if (!state->valid)
+				continue;
+			if (state->act_priority == MASKED)
+				continue;
+			if (state->act_server != xc->server_num)
+				continue;
+
+			/* Clean it up */
+			arch_spin_lock(&sb->lock);
+			state->act_priority = MASKED;
+			xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+			xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+			if (state->pt_number) {
+				xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
+				xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
+			}
+			arch_spin_unlock(&sb->lock);
+		}
+	}
+}
+
+void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct kvmppc_xive *xive = xc->xive;
+	int i;
+
+	pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
+
+	/* Ensure no interrupt is still routed to that VP */
+	xc->valid = false;
+	kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+	/* Mask the VP IPI */
+	xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
+
+	/* Disable the VP */
+	xive_native_disable_vp(xc->vp_id);
+
+	/* Free the queues & associated interrupts */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		struct xive_q *q = &xc->queues[i];
+
+		/* Free the escalation irq */
+		if (xc->esc_virq[i]) {
+			free_irq(xc->esc_virq[i], vcpu);
+			irq_dispose_mapping(xc->esc_virq[i]);
+			kfree(xc->esc_virq_names[i]);
+		}
+		/* Free the queue */
+		xive_native_disable_queue(xc->vp_id, q, i);
+		if (q->qpage) {
+			free_pages((unsigned long)q->qpage,
+				   xive->q_page_order);
+			q->qpage = NULL;
+		}
+	}
+
+	/* Free the IPI */
+	if (xc->vp_ipi) {
+		xive_cleanup_irq_data(&xc->vp_ipi_data);
+		xive_native_free_irq(xc->vp_ipi);
+	}
+	/* Free the VP */
+	kfree(xc);
+}
+
+int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
+			     struct kvm_vcpu *vcpu, u32 cpu)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvmppc_xive_vcpu *xc;
+	int i, r = -EBUSY;
+
+	pr_devel("connect_vcpu(cpu=%d)\n", cpu);
+
+	if (dev->ops != &kvm_xive_ops) {
+		pr_devel("Wrong ops !\n");
+		return -EPERM;
+	}
+	if (xive->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type)
+		return -EBUSY;
+	if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
+		pr_devel("Duplicate !\n");
+		return -EEXIST;
+	}
+	if (cpu >= KVM_MAX_VCPUS) {
+		pr_devel("Out of bounds !\n");
+		return -EINVAL;
+	}
+	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+	if (!xc)
+		return -ENOMEM;
+
+	/* We need to synchronize with queue provisioning */
+	mutex_lock(&vcpu->kvm->lock);
+	vcpu->arch.xive_vcpu = xc;
+	xc->xive = xive;
+	xc->vcpu = vcpu;
+	xc->server_num = cpu;
+	xc->vp_id = xive->vp_base + cpu;
+	xc->mfrr = 0xff;
+	xc->valid = true;
+
+	r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
+	if (r)
+		goto bail;
+
+	/* Configure VCPU fields for use by assembly push/pull */
+	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
+	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
+
+	/* Allocate IPI */
+	xc->vp_ipi = xive_native_alloc_irq();
+	if (!xc->vp_ipi) {
+		r = -EIO;
+		goto bail;
+	}
+	pr_devel(" IPI=0x%x\n", xc->vp_ipi);
+
+	r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
+	if (r)
+		goto bail;
+
+	/*
+	 * Initialize queues. Initially we set them all for no queueing
+	 * and we enable escalation for queue 0 only which we'll use for
+	 * our mfrr change notifications. If the VCPU is hot-plugged, we
+	 * do handle provisioning however.
+	 */
+	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+		struct xive_q *q = &xc->queues[i];
+
+		/* Is queue already enabled ? Provision it */
+		if (xive->qmap & (1 << i)) {
+			r = xive_provision_queue(vcpu, i);
+			if (r == 0)
+				xive_attach_escalation(vcpu, i);
+			if (r)
+				goto bail;
+		} else {
+			r = xive_native_configure_queue(xc->vp_id,
+							q, i, NULL, 0, true);
+			if (r) {
+				pr_err("Failed to configure queue %d for VCPU %d\n",
+				       i, cpu);
+				goto bail;
+			}
+		}
+	}
+
+	/* If not done above, attach priority 0 escalation */
+	r = xive_attach_escalation(vcpu, 0);
+	if (r)
+		goto bail;
+
+	/* Enable the VP */
+	r = xive_native_enable_vp(xc->vp_id);
+	if (r)
+		goto bail;
+
+	/* Route the IPI */
+	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
+	if (!r)
+		xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
+
+bail:
+	mutex_unlock(&vcpu->kvm->lock);
+	if (r) {
+		kvmppc_xive_cleanup_vcpu(vcpu);
+		return r;
+	}
+
+	vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+	return 0;
+}
+
+/*
+ * Scanning of queues before/after migration save
+ */
+static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return;
+
+	state = &sb->irq_state[idx];
+
+	/* Some sanity checking */
+	if (!state->valid) {
+		pr_err("invalid irq 0x%x in cpu queue!\n", irq);
+		return;
+	}
+
+	/*
+	 * If the interrupt is in a queue it should have P set.
+	 * We warn so that gets reported. A backtrace isn't useful
+	 * so no need to use a WARN_ON.
+	 */
+	if (!state->saved_p)
+		pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);
+
+	/* Set flag */
+	state->in_queue = true;
+}
+
+static void xive_pre_save_mask_irq(struct kvmppc_xive *xive,
+				   struct kvmppc_xive_src_block *sb,
+				   u32 irq)
+{
+	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
+
+	if (!state->valid)
+		return;
+
+	/* Mask and save state, this will also sync HW queues */
+	state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
+
+	/* Transfer P and Q */
+	state->saved_p = state->old_p;
+	state->saved_q = state->old_q;
+
+	/* Unlock */
+	arch_spin_unlock(&sb->lock);
+}
+
+static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive,
+				     struct kvmppc_xive_src_block *sb,
+				     u32 irq)
+{
+	struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
+
+	if (!state->valid)
+		return;
+
+	/*
+	 * Lock / exclude EOI (not technically necessary if the
+	 * guest isn't running concurrently. If this becomes a
+	 * performance issue we can probably remove the lock.
+	 */
+	xive_lock_for_unmask(sb, state);
+
+	/* Restore mask/prio if it wasn't masked */
+	if (state->saved_scan_prio != MASKED)
+		xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
+
+	/* Unlock */
+	arch_spin_unlock(&sb->lock);
+}
+
+static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
+{
+	u32 idx = q->idx;
+	u32 toggle = q->toggle;
+	u32 irq;
+
+	do {
+		irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
+		if (irq > XICS_IPI)
+			xive_pre_save_set_queued(xive, irq);
+	} while(irq);
+}
+
+static void xive_pre_save_scan(struct kvmppc_xive *xive)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i, j;
+
+	/*
+	 * See comment in xive_get_source() about how this
+	 * work. Collect a stable state for all interrupts
+	 */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			xive_pre_save_mask_irq(xive, sb, j);
+	}
+
+	/* Then scan the queues and update the "in_queue" flag */
+	kvm_for_each_vcpu(i, vcpu, xive->kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+		if (!xc)
+			continue;
+		for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
+			if (xc->queues[i].qpage)
+				xive_pre_save_queue(xive, &xc->queues[i]);
+		}
+	}
+
+	/* Finally restore interrupt states */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			xive_pre_save_unmask_irq(xive, sb, j);
+	}
+}
+
+static void xive_post_save_scan(struct kvmppc_xive *xive)
+{
+	u32 i, j;
+
+	/* Clear all the in_queue flags */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+		if (!sb)
+			continue;
+		for (j = 0;  j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+			sb->irq_state[j].in_queue = false;
+	}
+
+	/* Next get_source() will do a new scan */
+	xive->saved_src_count = 0;
+}
+
+/*
+ * This returns the source configuration and state to user space.
+ */
+static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u64 val, prio;
+	u16 idx;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -ENOENT;
+
+	state = &sb->irq_state[idx];
+
+	if (!state->valid)
+		return -ENOENT;
+
+	pr_devel("get_source(%ld)...\n", irq);
+
+	/*
+	 * So to properly save the state into something that looks like a
+	 * XICS migration stream we cannot treat interrupts individually.
+	 *
+	 * We need, instead, mask them all (& save their previous PQ state)
+	 * to get a stable state in the HW, then sync them to ensure that
+	 * any interrupt that had already fired hits its queue, and finally
+	 * scan all the queues to collect which interrupts are still present
+	 * in the queues, so we can set the "pending" flag on them and
+	 * they can be resent on restore.
+	 *
+	 * So we do it all when the "first" interrupt gets saved, all the
+	 * state is collected at that point, the rest of xive_get_source()
+	 * will merely collect and convert that state to the expected
+	 * userspace bit mask.
+	 */
+	if (xive->saved_src_count == 0)
+		xive_pre_save_scan(xive);
+	xive->saved_src_count++;
+
+	/* Convert saved state into something compatible with xics */
+	val = state->guest_server;
+	prio = state->saved_scan_prio;
+
+	if (prio == MASKED) {
+		val |= KVM_XICS_MASKED;
+		prio = state->saved_priority;
+	}
+	val |= prio << KVM_XICS_PRIORITY_SHIFT;
+	if (state->lsi) {
+		val |= KVM_XICS_LEVEL_SENSITIVE;
+		if (state->saved_p)
+			val |= KVM_XICS_PENDING;
+	} else {
+		if (state->saved_p)
+			val |= KVM_XICS_PRESENTED;
+
+		if (state->saved_q)
+			val |= KVM_XICS_QUEUED;
+
+		/*
+		 * We mark it pending (which will attempt a re-delivery)
+		 * if we are in a queue *or* we were masked and had
+		 * Q set which is equivalent to the XICS "masked pending"
+		 * state
+		 */
+		if (state->in_queue || (prio == MASKED && state->saved_q))
+			val |= KVM_XICS_PENDING;
+	}
+
+	/*
+	 * If that was the last interrupt saved, reset the
+	 * in_queue flags
+	 */
+	if (xive->saved_src_count == xive->src_count)
+		xive_post_save_scan(xive);
+
+	/* Copy the result to userspace */
+	if (put_user(val, ubufp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
+							   int irq)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvmppc_xive_src_block *sb;
+	int i, bid;
+
+	bid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&kvm->lock);
+
+	/* block already exists - somebody else got here first */
+	if (xive->src_blocks[bid])
+		goto out;
+
+	/* Create the ICS */
+	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
+	if (!sb)
+		goto out;
+
+	sb->id = bid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
+		sb->irq_state[i].guest_priority = MASKED;
+		sb->irq_state[i].saved_priority = MASKED;
+		sb->irq_state[i].act_priority = MASKED;
+	}
+	smp_wmb();
+	xive->src_blocks[bid] = sb;
+
+	if (bid > xive->max_sbid)
+		xive->max_sbid = bid;
+
+out:
+	mutex_unlock(&kvm->lock);
+	return xive->src_blocks[bid];
+}
+
+static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
+{
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+		if (!xc)
+			continue;
+
+		if (xc->delayed_irq == irq) {
+			xc->delayed_irq = 0;
+			xive->delayed_irqs--;
+			return true;
+		}
+	}
+	return false;
+}
+
+static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 act_prio, guest_prio;
+	u32 server;
+	int rc = 0;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	pr_devel("set_source(irq=0x%lx)\n", irq);
+
+	/* Find the source */
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb) {
+		pr_devel("No source, creating source block...\n");
+		sb = xive_create_src_block(xive, irq);
+		if (!sb) {
+			pr_devel("Failed to create block...\n");
+			return -ENOMEM;
+		}
+	}
+	state = &sb->irq_state[idx];
+
+	/* Read user passed data */
+	if (get_user(val, ubufp)) {
+		pr_devel("fault getting user info !\n");
+		return -EFAULT;
+	}
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
+
+	pr_devel("  val=0x016%llx (server=0x%x, guest_prio=%d)\n",
+		 val, server, guest_prio);
+	/*
+	 * If the source doesn't already have an IPI, allocate
+	 * one and get the corresponding data
+	 */
+	if (!state->ipi_number) {
+		state->ipi_number = xive_native_alloc_irq();
+		if (state->ipi_number == 0) {
+			pr_devel("Failed to allocate IPI !\n");
+			return -ENOMEM;
+		}
+		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
+		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
+	}
+
+	/*
+	 * We use lock_and_mask() to set us in the right masked
+	 * state. We will override that state from the saved state
+	 * further down, but this will handle the cases of interrupts
+	 * that need FW masking. We set the initial guest_priority to
+	 * 0 before calling it to ensure it actually performs the masking.
+	 */
+	state->guest_priority = 0;
+	xive_lock_and_mask(xive, sb, state);
+
+	/*
+	 * Now, we select a target if we have one. If we don't we
+	 * leave the interrupt untargetted. It means that an interrupt
+	 * can become "untargetted" accross migration if it was masked
+	 * by set_xive() but there is little we can do about it.
+	 */
+
+	/* First convert prio and mark interrupt as untargetted */
+	act_prio = xive_prio_from_guest(guest_prio);
+	state->act_priority = MASKED;
+	state->guest_server = server;
+
+	/*
+	 * We need to drop the lock due to the mutex below. Hopefully
+	 * nothing is touching that interrupt yet since it hasn't been
+	 * advertized to a running guest yet
+	 */
+	arch_spin_unlock(&sb->lock);
+
+	/* If we have a priority target the interrupt */
+	if (act_prio != MASKED) {
+		/* First, check provisioning of queues */
+		mutex_lock(&xive->kvm->lock);
+		rc = xive_check_provisioning(xive->kvm, act_prio);
+		mutex_unlock(&xive->kvm->lock);
+
+		/* Target interrupt */
+		if (rc == 0)
+			rc = xive_target_interrupt(xive->kvm, state,
+						   server, act_prio);
+		/*
+		 * If provisioning or targetting failed, leave it
+		 * alone and masked. It will remain disabled until
+		 * the guest re-targets it.
+		 */
+	}
+
+	/*
+	 * Find out if this was a delayed irq stashed in an ICP,
+	 * in which case, treat it as pending
+	 */
+	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
+		val |= KVM_XICS_PENDING;
+		pr_devel("  Found delayed ! forcing PENDING !\n");
+	}
+
+	/* Cleanup the SW state */
+	state->old_p = false;
+	state->old_q = false;
+	state->lsi = false;
+	state->asserted = false;
+
+	/* Restore LSI state */
+	if (val & KVM_XICS_LEVEL_SENSITIVE) {
+		state->lsi = true;
+		if (val & KVM_XICS_PENDING)
+			state->asserted = true;
+		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
+	}
+
+	/*
+	 * Restore P and Q. If the interrupt was pending, we
+	 * force both P and Q, which will trigger a resend.
+	 *
+	 * That means that a guest that had both an interrupt
+	 * pending (queued) and Q set will restore with only
+	 * one instance of that interrupt instead of 2, but that
+	 * is perfectly fine as coalescing interrupts that haven't
+	 * been presented yet is always allowed.
+	 */
+	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+		state->old_p = true;
+	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
+		state->old_q = true;
+
+	pr_devel("  P=%d, Q=%d\n", state->old_p, state->old_q);
+
+	/*
+	 * If the interrupt was unmasked, update guest priority and
+	 * perform the appropriate state transition and do a
+	 * re-trigger if necessary.
+	 */
+	if (val & KVM_XICS_MASKED) {
+		pr_devel("  masked, saving prio\n");
+		state->guest_priority = MASKED;
+		state->saved_priority = guest_prio;
+	} else {
+		pr_devel("  unmasked, restoring to prio %d\n", guest_prio);
+		xive_finish_unmask(xive, sb, state, guest_prio);
+		state->saved_priority = guest_prio;
+	}
+
+	/* Increment the number of valid sources and mark this one valid */
+	if (!state->valid)
+		xive->src_count++;
+	state->valid = true;
+
+	return 0;
+}
+
+int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+			bool line_status)
+{
+	struct kvmppc_xive *xive = kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	u16 idx;
+
+	if (!xive)
+		return -ENODEV;
+
+	sb = kvmppc_xive_find_source(xive, irq, &idx);
+	if (!sb)
+		return -EINVAL;
+
+	/* Perform locklessly .... (we need to do some RCUisms here...) */
+	state = &sb->irq_state[idx];
+	if (!state->valid)
+		return -EINVAL;
+
+	/* We don't allow a trigger on a passed-through interrupt */
+	if (state->pt_number)
+		return -EINVAL;
+
+	if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return 0;
+	}
+
+	/* Trigger the IPI */
+	xive_irq_trigger(&state->ipi_data);
+
+	return 0;
+}
+
+static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* We honor the existing XICS ioctl */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xive_set_source(xive, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xive *xive = dev->private;
+
+	/* We honor the existing XICS ioctl */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xive_get_source(xive, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	/* We honor the same limits as XICS, at least for now */
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	}
+	return -ENXIO;
+}
+
+static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
+{
+	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+	xive_native_configure_irq(hw_num, 0, MASKED, 0);
+	xive_cleanup_irq_data(xd);
+}
+
+static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
+{
+	int i;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
+
+		if (!state->valid)
+			continue;
+
+		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
+		xive_native_free_irq(state->ipi_number);
+
+		/* Pass-through, cleanup too */
+		if (state->pt_number)
+			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
+
+		state->valid = false;
+	}
+}
+
+static void kvmppc_xive_free(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = dev->private;
+	struct kvm *kvm = xive->kvm;
+	int i;
+
+	debugfs_remove(xive->dentry);
+
+	if (kvm)
+		kvm->arch.xive = NULL;
+
+	/* Mask and free interrupts */
+	for (i = 0; i <= xive->max_sbid; i++) {
+		if (xive->src_blocks[i])
+			kvmppc_xive_free_sources(xive->src_blocks[i]);
+		kfree(xive->src_blocks[i]);
+		xive->src_blocks[i] = NULL;
+	}
+
+	if (xive->vp_base != XIVE_INVALID_VP)
+		xive_native_free_vp_block(xive->vp_base);
+
+
+	kfree(xive);
+	kfree(dev);
+}
+
+static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xive *xive;
+	struct kvm *kvm = dev->kvm;
+	int ret = 0;
+
+	pr_devel("Creating xive for partition\n");
+
+	xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+	if (!xive)
+		return -ENOMEM;
+
+	dev->private = xive;
+	xive->dev = dev;
+	xive->kvm = kvm;
+
+	/* Already there ? */
+	if (kvm->arch.xive)
+		ret = -EEXIST;
+	else
+		kvm->arch.xive = xive;
+
+	/* We use the default queue size set by the host */
+	xive->q_order = xive_native_default_eq_shift();
+	if (xive->q_order < PAGE_SHIFT)
+		xive->q_page_order = 0;
+	else
+		xive->q_page_order = xive->q_order - PAGE_SHIFT;
+
+	/* Allocate a bunch of VPs */
+	xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
+	pr_devel("VP_Base=%x\n", xive->vp_base);
+
+	if (xive->vp_base == XIVE_INVALID_VP)
+		ret = -ENOMEM;
+
+	if (ret) {
+		kfree(xive);
+		return ret;
+	}
+
+	return 0;
+}
+
+
+static int xive_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xive *xive = m->private;
+	struct kvm *kvm = xive->kvm;
+	struct kvm_vcpu *vcpu;
+	u64 t_rm_h_xirr = 0;
+	u64 t_rm_h_ipoll = 0;
+	u64 t_rm_h_cppr = 0;
+	u64 t_rm_h_eoi = 0;
+	u64 t_rm_h_ipi = 0;
+	u64 t_vm_h_xirr = 0;
+	u64 t_vm_h_ipoll = 0;
+	u64 t_vm_h_cppr = 0;
+	u64 t_vm_h_eoi = 0;
+	u64 t_vm_h_ipi = 0;
+	unsigned int i;
+
+	if (!kvm)
+		return 0;
+
+	seq_printf(m, "=========\nVCPU state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+		if (!xc)
+			continue;
+
+		seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
+			   " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
+			   xc->server_num, xc->cppr, xc->hw_cppr,
+			   xc->mfrr, xc->pending,
+			   xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
+
+		t_rm_h_xirr += xc->stat_rm_h_xirr;
+		t_rm_h_ipoll += xc->stat_rm_h_ipoll;
+		t_rm_h_cppr += xc->stat_rm_h_cppr;
+		t_rm_h_eoi += xc->stat_rm_h_eoi;
+		t_rm_h_ipi += xc->stat_rm_h_ipi;
+		t_vm_h_xirr += xc->stat_vm_h_xirr;
+		t_vm_h_ipoll += xc->stat_vm_h_ipoll;
+		t_vm_h_cppr += xc->stat_vm_h_cppr;
+		t_vm_h_eoi += xc->stat_vm_h_eoi;
+		t_vm_h_ipi += xc->stat_vm_h_ipi;
+	}
+
+	seq_printf(m, "Hcalls totals\n");
+	seq_printf(m, " H_XIRR  R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
+	seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
+	seq_printf(m, " H_CPPR  R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
+	seq_printf(m, " H_EOI   R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
+	seq_printf(m, " H_IPI   R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
+
+	return 0;
+}
+
+static int xive_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xive_debug_show, inode->i_private);
+}
+
+static const struct file_operations xive_debug_fops = {
+	.open = xive_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void xive_debugfs_init(struct kvmppc_xive *xive)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
+	if (!name) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+					   xive, &xive_debug_fops);
+
+	pr_debug("%s: created %s\n", __func__, name);
+	kfree(name);
+}
+
+static void kvmppc_xive_init(struct kvm_device *dev)
+{
+	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
+
+	/* Register some debug interfaces */
+	xive_debugfs_init(xive);
+}
+
+struct kvm_device_ops kvm_xive_ops = {
+	.name = "kvm-xive",
+	.create = kvmppc_xive_create,
+	.init = kvmppc_xive_init,
+	.destroy = kvmppc_xive_free,
+	.set_attr = xive_set_attr,
+	.get_attr = xive_get_attr,
+	.has_attr = xive_has_attr,
+};
+
+void kvmppc_xive_init_module(void)
+{
+	__xive_vm_h_xirr = xive_vm_h_xirr;
+	__xive_vm_h_ipoll = xive_vm_h_ipoll;
+	__xive_vm_h_ipi = xive_vm_h_ipi;
+	__xive_vm_h_cppr = xive_vm_h_cppr;
+	__xive_vm_h_eoi = xive_vm_h_eoi;
+}
+
+void kvmppc_xive_exit_module(void)
+{
+	__xive_vm_h_xirr = NULL;
+	__xive_vm_h_ipoll = NULL;
+	__xive_vm_h_ipi = NULL;
+	__xive_vm_h_cppr = NULL;
+	__xive_vm_h_eoi = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
new file mode 100644
index 000000000000..5938f7644dc1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XIVE_H
+#define _KVM_PPC_BOOK3S_XIVE_H
+
+#ifdef CONFIG_KVM_XICS
+#include "book3s_xics.h"
+
+/*
+ * State for one guest irq source.
+ *
+ * For each guest source we allocate a HW interrupt in the XIVE
+ * which we use for all SW triggers. It will be unused for
+ * pass-through but it's easier to keep around as the same
+ * guest interrupt can alternatively be emulated or pass-through
+ * if a physical device is hot unplugged and replaced with an
+ * emulated one.
+ *
+ * This state structure is very similar to the XICS one with
+ * additional XIVE specific tracking.
+ */
+struct kvmppc_xive_irq_state {
+	bool valid;			/* Interrupt entry is valid */
+
+	u32 number;			/* Guest IRQ number */
+	u32 ipi_number;			/* XIVE IPI HW number */
+	struct xive_irq_data ipi_data;	/* XIVE IPI associated data */
+	u32 pt_number;			/* XIVE Pass-through number if any */
+	struct xive_irq_data *pt_data;	/* XIVE Pass-through associated data */
+
+	/* Targetting as set by guest */
+	u32 guest_server;		/* Current guest selected target */
+	u8 guest_priority;		/* Guest set priority */
+	u8 saved_priority;		/* Saved priority when masking */
+
+	/* Actual targetting */
+	u32 act_server;			/* Actual server */
+	u8 act_priority;		/* Actual priority */
+
+	/* Various state bits */
+	bool in_eoi;			/* Synchronize with H_EOI */
+	bool old_p;			/* P bit state when masking */
+	bool old_q;			/* Q bit state when masking */
+	bool lsi;			/* level-sensitive interrupt */
+	bool asserted;			/* Only for emulated LSI: current state */
+
+	/* Saved for migration state */
+	bool in_queue;
+	bool saved_p;
+	bool saved_q;
+	u8 saved_scan_prio;
+};
+
+/* Select the "right" interrupt (IPI vs. passthrough) */
+static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state,
+					  u32 *out_hw_irq,
+					  struct xive_irq_data **out_xd)
+{
+	if (state->pt_number) {
+		if (out_hw_irq)
+			*out_hw_irq = state->pt_number;
+		if (out_xd)
+			*out_xd = state->pt_data;
+	} else {
+		if (out_hw_irq)
+			*out_hw_irq = state->ipi_number;
+		if (out_xd)
+			*out_xd = &state->ipi_data;
+	}
+}
+
+/*
+ * This corresponds to an "ICS" in XICS terminology, we use it
+ * as a mean to break up source information into multiple structures.
+ */
+struct kvmppc_xive_src_block {
+	arch_spinlock_t lock;
+	u16 id;
+	struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+
+struct kvmppc_xive {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct dentry *dentry;
+
+	/* VP block associated with the VM */
+	u32	vp_base;
+
+	/* Blocks of sources */
+	struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1];
+	u32	max_sbid;
+
+	/*
+	 * For state save, we lazily scan the queues on the first interrupt
+	 * being migrated. We don't have a clean way to reset that flags
+	 * so we keep track of the number of valid sources and how many of
+	 * them were migrated so we can reset when all of them have been
+	 * processed.
+	 */
+	u32	src_count;
+	u32	saved_src_count;
+
+	/*
+	 * Some irqs are delayed on restore until the source is created,
+	 * keep track here of how many of them
+	 */
+	u32	delayed_irqs;
+
+	/* Which queues (priorities) are in use by the guest */
+	u8	qmap;
+
+	/* Queue orders */
+	u32	q_order;
+	u32	q_page_order;
+
+};
+
+#define KVMPPC_XIVE_Q_COUNT	8
+
+struct kvmppc_xive_vcpu {
+	struct kvmppc_xive	*xive;
+	struct kvm_vcpu		*vcpu;
+	bool			valid;
+
+	/* Server number. This is the HW CPU ID from a guest perspective */
+	u32			server_num;
+
+	/*
+	 * HW VP corresponding to this VCPU. This is the base of the VP
+	 * block plus the server number.
+	 */
+	u32			vp_id;
+	u32			vp_chip_id;
+	u32			vp_cam;
+
+	/* IPI used for sending ... IPIs */
+	u32			vp_ipi;
+	struct xive_irq_data	vp_ipi_data;
+
+	/* Local emulation state */
+	uint8_t			cppr;	/* guest CPPR */
+	uint8_t			hw_cppr;/* Hardware CPPR */
+	uint8_t			mfrr;
+	uint8_t			pending;
+
+	/* Each VP has 8 queues though we only provision some */
+	struct xive_q		queues[KVMPPC_XIVE_Q_COUNT];
+	u32			esc_virq[KVMPPC_XIVE_Q_COUNT];
+	char			*esc_virq_names[KVMPPC_XIVE_Q_COUNT];
+
+	/* Stash a delayed irq on restore from migration (see set_icp) */
+	u32			delayed_irq;
+
+	/* Stats */
+	u64			stat_rm_h_xirr;
+	u64			stat_rm_h_ipoll;
+	u64			stat_rm_h_cppr;
+	u64			stat_rm_h_eoi;
+	u64			stat_rm_h_ipi;
+	u64			stat_vm_h_xirr;
+	u64			stat_vm_h_ipoll;
+	u64			stat_vm_h_cppr;
+	u64			stat_vm_h_eoi;
+	u64			stat_vm_h_ipi;
+};
+
+static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num)
+			return vcpu;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive,
+		u32 irq, u16 *source)
+{
+	u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+
+	if (source)
+		*source = src;
+	if (bid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	return xive->src_blocks[bid];
+}
+
+/*
+ * Mapping between guest priorities and host priorities
+ * is as follow.
+ *
+ * Guest request for 0...6 are honored. Guest request for anything
+ * higher results in a priority of 7 being applied.
+ *
+ * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
+ * in order to match AIX expectations
+ *
+ * Similar mapping is done for CPPR values
+ */
+static inline u8 xive_prio_from_guest(u8 prio)
+{
+	if (prio == 0xff || prio < 8)
+		return prio;
+	return 7;
+}
+
+static inline u8 xive_prio_to_guest(u8 prio)
+{
+	if (prio == 0xff || prio < 7)
+		return prio;
+	return 0xb;
+}
+
+static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
+{
+	u32 cur;
+
+	if (!qpage)
+		return 0;
+	cur = be32_to_cpup(qpage + *idx);
+	if ((cur >> 31) == *toggle)
+		return 0;
+	*idx = (*idx + 1) & msk;
+	if (*idx == 0)
+		(*toggle) ^= 1;
+	return cur & 0x7fffffff;
+}
+
+extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu);
+extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
+extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+			 unsigned long mfrr);
+extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
+extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
+extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
+			      unsigned long mfrr);
+extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+#endif /* CONFIG_KVM_XICS */
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
new file mode 100644
index 000000000000..023a31133c37
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+/* File to be included by other .c files */
+
+#define XGLUE(a,b) a##b
+#define GLUE(a,b) XGLUE(a,b)
+
+static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
+{
+	u8 cppr;
+	u16 ack;
+
+	/* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
+
+	/* Perform the acknowledge OS to register cycle. */
+	ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
+
+	/* Synchronize subsequent queue accesses */
+	mb();
+
+	/* XXX Check grouping level */
+
+	/* Anything ? */
+	if (!((ack >> 8) & TM_QW1_NSR_EO))
+		return;
+
+	/* Grab CPPR of the most favored pending interrupt */
+	cppr = ack & 0xff;
+	if (cppr < 8)
+		xc->pending |= 1 << cppr;
+
+#ifdef XIVE_RUNTIME_CHECKS
+	/* Check consistency */
+	if (cppr >= xc->hw_cppr)
+		pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
+			smp_processor_id(), cppr, xc->hw_cppr);
+#endif
+
+	/*
+	 * Update our image of the HW CPPR. We don't yet modify
+	 * xc->cppr, this will be done as we scan for interrupts
+	 * in the queues.
+	 */
+	xc->hw_cppr = cppr;
+}
+
+static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+		offset |= offset << 4;
+
+	val =__x_readq(__x_eoi_page(xd) + offset);
+#ifdef __LITTLE_ENDIAN__
+	val >>= 64-8;
+#endif
+	return (u8)val;
+}
+
+
+static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
+{
+	/* If the XIVE supports the new "store EOI facility, use it */
+	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+		__x_writeq(0, __x_eoi_page(xd));
+	else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
+		opal_int_eoi(hw_irq);
+	} else {
+		uint64_t eoi_val;
+
+		/*
+		 * Otherwise for EOI, we use the special MMIO that does
+		 * a clear of both P and Q and returns the old Q,
+		 * except for LSIs where we use the "EOI cycle" special
+		 * load.
+		 *
+		 * This allows us to then do a re-trigger if Q was set
+		 * rather than synthetizing an interrupt in software
+		 *
+		 * For LSIs, using the HW EOI cycle works around a problem
+		 * on P9 DD1 PHBs where the other ESB accesses don't work
+		 * properly.
+		 */
+		if (xd->flags & XIVE_IRQ_FLAG_LSI)
+			__x_readq(__x_eoi_page(xd));
+		else {
+			eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00);
+
+			/* Re-trigger if needed */
+			if ((eoi_val & 1) && __x_trig_page(xd))
+				__x_writeq(0, __x_trig_page(xd));
+		}
+	}
+}
+
+enum {
+	scan_fetch,
+	scan_poll,
+	scan_eoi,
+};
+
+static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
+				       u8 pending, int scan_type)
+{
+	u32 hirq = 0;
+	u8 prio = 0xff;
+
+	/* Find highest pending priority */
+	while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
+		struct xive_q *q;
+		u32 idx, toggle;
+		__be32 *qpage;
+
+		/*
+		 * If pending is 0 this will return 0xff which is what
+		 * we want
+		 */
+		prio = ffs(pending) - 1;
+
+		/*
+		 * If the most favoured prio we found pending is less
+		 * favored (or equal) than a pending IPI, we return
+		 * the IPI instead.
+		 *
+		 * Note: If pending was 0 and mfrr is 0xff, we will
+		 * not spurriously take an IPI because mfrr cannot
+		 * then be smaller than cppr.
+		 */
+		if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
+			prio = xc->mfrr;
+			hirq = XICS_IPI;
+			break;
+		}
+
+		/* Don't scan past the guest cppr */
+		if (prio >= xc->cppr || prio > 7)
+			break;
+
+		/* Grab queue and pointers */
+		q = &xc->queues[prio];
+		idx = q->idx;
+		toggle = q->toggle;
+
+		/*
+		 * Snapshot the queue page. The test further down for EOI
+		 * must use the same "copy" that was used by __xive_read_eq
+		 * since qpage can be set concurrently and we don't want
+		 * to miss an EOI.
+		 */
+		qpage = READ_ONCE(q->qpage);
+
+skip_ipi:
+		/*
+		 * Try to fetch from the queue. Will return 0 for a
+		 * non-queueing priority (ie, qpage = 0).
+		 */
+		hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
+
+		/*
+		 * If this was a signal for an MFFR change done by
+		 * H_IPI we skip it. Additionally, if we were fetching
+		 * we EOI it now, thus re-enabling reception of a new
+		 * such signal.
+		 *
+		 * We also need to do that if prio is 0 and we had no
+		 * page for the queue. In this case, we have non-queued
+		 * IPI that needs to be EOId.
+		 *
+		 * This is safe because if we have another pending MFRR
+		 * change that wasn't observed above, the Q bit will have
+		 * been set and another occurrence of the IPI will trigger.
+		 */
+		if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
+			if (scan_type == scan_fetch)
+				GLUE(X_PFX,source_eoi)(xc->vp_ipi,
+						       &xc->vp_ipi_data);
+			/* Loop back on same queue with updated idx/toggle */
+#ifdef XIVE_RUNTIME_CHECKS
+			WARN_ON(hirq && hirq != XICS_IPI);
+#endif
+			if (hirq)
+				goto skip_ipi;
+		}
+
+		/* If fetching, update queue pointers */
+		if (scan_type == scan_fetch) {
+			q->idx = idx;
+			q->toggle = toggle;
+		}
+
+		/* Something found, stop searching */
+		if (hirq)
+			break;
+
+		/* Clear the pending bit on the now empty queue */
+		pending &= ~(1 << prio);
+
+		/*
+		 * Check if the queue count needs adjusting due to
+		 * interrupts being moved away.
+		 */
+		if (atomic_read(&q->pending_count)) {
+			int p = atomic_xchg(&q->pending_count, 0);
+			if (p) {
+#ifdef XIVE_RUNTIME_CHECKS
+				WARN_ON(p > atomic_read(&q->count));
+#endif
+				atomic_sub(p, &q->count);
+			}
+		}
+	}
+
+	/* If we are just taking a "peek", do nothing else */
+	if (scan_type == scan_poll)
+		return hirq;
+
+	/* Update the pending bits */
+	xc->pending = pending;
+
+	/*
+	 * If this is an EOI that's it, no CPPR adjustment done here,
+	 * all we needed was cleanup the stale pending bits and check
+	 * if there's anything left.
+	 */
+	if (scan_type == scan_eoi)
+		return hirq;
+
+	/*
+	 * If we found an interrupt, adjust what the guest CPPR should
+	 * be as if we had just fetched that interrupt from HW.
+	 */
+	if (hirq)
+		xc->cppr = prio;
+	/*
+	 * If it was an IPI the HW CPPR might have been lowered too much
+	 * as the HW interrupt we use for IPIs is routed to priority 0.
+	 *
+	 * We re-sync it here.
+	 */
+	if (xc->cppr != xc->hw_cppr) {
+		xc->hw_cppr = xc->cppr;
+		__x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR);
+	}
+
+	return hirq;
+}
+
+X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u8 old_cppr;
+	u32 hirq;
+
+	pr_devel("H_XIRR\n");
+
+	xc->GLUE(X_STAT_PFX,h_xirr)++;
+
+	/* First collect pending bits from HW */
+	GLUE(X_PFX,ack_pending)(xc);
+
+	/*
+	 * Cleanup the old-style bits if needed (they may have been
+	 * set by pull or an escalation interrupts).
+	 */
+	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
+		clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
+			  &vcpu->arch.pending_exceptions);
+
+	pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
+		 xc->pending, xc->hw_cppr, xc->cppr);
+
+	/* Grab previous CPPR and reverse map it */
+	old_cppr = xive_prio_to_guest(xc->cppr);
+
+	/* Scan for actual interrupts */
+	hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch);
+
+	pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
+		 hirq, xc->hw_cppr, xc->cppr);
+
+#ifdef XIVE_RUNTIME_CHECKS
+	/* That should never hit */
+	if (hirq & 0xff000000)
+		pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
+#endif
+
+	/*
+	 * XXX We could check if the interrupt is masked here and
+	 * filter it. If we chose to do so, we would need to do:
+	 *
+	 *    if (masked) {
+	 *        lock();
+	 *        if (masked) {
+	 *            old_Q = true;
+	 *            hirq = 0;
+	 *        }
+	 *        unlock();
+	 *    }
+	 */
+
+	/* Return interrupt and old CPPR in GPR4 */
+	vcpu->arch.gpr[4] = hirq | (old_cppr << 24);
+
+	return H_SUCCESS;
+}
+
+X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u8 pending = xc->pending;
+	u32 hirq;
+	u8 pipr;
+
+	pr_devel("H_IPOLL(server=%ld)\n", server);
+
+	xc->GLUE(X_STAT_PFX,h_ipoll)++;
+
+	/* Grab the target VCPU if not the current one */
+	if (xc->server_num != server) {
+		vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+		if (!vcpu)
+			return H_PARAMETER;
+		xc = vcpu->arch.xive_vcpu;
+
+		/* Scan all priorities */
+		pending = 0xff;
+	} else {
+		/* Grab pending interrupt if any */
+		pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+		if (pipr < 8)
+			pending |= 1 << pipr;
+	}
+
+	hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll);
+
+	/* Return interrupt and old CPPR in GPR4 */
+	vcpu->arch.gpr[4] = hirq | (xc->cppr << 24);
+
+	return H_SUCCESS;
+}
+
+static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc)
+{
+	u8 pending, prio;
+
+	pending = xc->pending;
+	if (xc->mfrr != 0xff) {
+		if (xc->mfrr < 8)
+			pending |= 1 << xc->mfrr;
+		else
+			pending |= 0x80;
+	}
+	if (!pending)
+		return;
+	prio = ffs(pending) - 1;
+
+	__x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING);
+}
+
+X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	u8 old_cppr;
+
+	pr_devel("H_CPPR(cppr=%ld)\n", cppr);
+
+	xc->GLUE(X_STAT_PFX,h_cppr)++;
+
+	/* Map CPPR */
+	cppr = xive_prio_from_guest(cppr);
+
+	/* Remember old and update SW state */
+	old_cppr = xc->cppr;
+	xc->cppr = cppr;
+
+	/*
+	 * We are masking less, we need to look for pending things
+	 * to deliver and set VP pending bits accordingly to trigger
+	 * a new interrupt otherwise we might miss MFRR changes for
+	 * which we have optimized out sending an IPI signal.
+	 */
+	if (cppr > old_cppr)
+		GLUE(X_PFX,push_pending_to_hw)(xc);
+
+	/* Apply new CPPR */
+	xc->hw_cppr = cppr;
+	__x_writeb(cppr, __x_tima + TM_QW1_OS + TM_CPPR);
+
+	return H_SUCCESS;
+}
+
+X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+	struct kvmppc_xive_src_block *sb;
+	struct kvmppc_xive_irq_state *state;
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+	struct xive_irq_data *xd;
+	u8 new_cppr = xirr >> 24;
+	u32 irq = xirr & 0x00ffffff, hw_num;
+	u16 src;
+	int rc = 0;
+
+	pr_devel("H_EOI(xirr=%08lx)\n", xirr);
+
+	xc->GLUE(X_STAT_PFX,h_eoi)++;
+
+	xc->cppr = xive_prio_from_guest(new_cppr);
+
+	/*
+	 * IPIs are synthetized from MFRR and thus don't need
+	 * any special EOI handling. The underlying interrupt
+	 * used to signal MFRR changes is EOId when fetched from
+	 * the queue.
+	 */
+	if (irq == XICS_IPI || irq == 0)
+		goto bail;
+
+	/* Find interrupt source */
+	sb = kvmppc_xive_find_source(xive, irq, &src);
+	if (!sb) {
+		pr_devel(" source not found !\n");
+		rc = H_PARAMETER;
+		goto bail;
+	}
+	state = &sb->irq_state[src];
+	kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+	state->in_eoi = true;
+	mb();
+
+again:
+	if (state->guest_priority == MASKED) {
+		arch_spin_lock(&sb->lock);
+		if (state->guest_priority != MASKED) {
+			arch_spin_unlock(&sb->lock);
+			goto again;
+		}
+		pr_devel(" EOI on saved P...\n");
+
+		/* Clear old_p, that will cause unmask to perform an EOI */
+		state->old_p = false;
+
+		arch_spin_unlock(&sb->lock);
+	} else {
+		pr_devel(" EOI on source...\n");
+
+		/* Perform EOI on the source */
+		GLUE(X_PFX,source_eoi)(hw_num, xd);
+
+		/* If it's an emulated LSI, check level and resend */
+		if (state->lsi && state->asserted)
+			__x_writeq(0, __x_trig_page(xd));
+
+	}
+
+	mb();
+	state->in_eoi = false;
+bail:
+
+	/* Re-evaluate pending IRQs and update HW */
+	GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi);
+	GLUE(X_PFX,push_pending_to_hw)(xc);
+	pr_devel(" after scan pending=%02x\n", xc->pending);
+
+	/* Apply new CPPR */
+	xc->hw_cppr = xc->cppr;
+	__x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR);
+
+	return rc;
+}
+
+X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
+			       unsigned long mfrr)
+{
+	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+	pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
+
+	xc->GLUE(X_STAT_PFX,h_ipi)++;
+
+	/* Find target */
+	vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
+	if (!vcpu)
+		return H_PARAMETER;
+	xc = vcpu->arch.xive_vcpu;
+
+	/* Locklessly write over MFRR */
+	xc->mfrr = mfrr;
+
+	/* Shoot the IPI if most favored than target cppr */
+	if (mfrr < xc->cppr)
+		__x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
+
+	return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
index 5a9a10b90762..3f1be85a83bc 100644
--- a/arch/powerpc/kvm/irq.h
+++ b/arch/powerpc/kvm/irq.h
@@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 #endif
 #ifdef CONFIG_KVM_XICS
 	ret = ret || (kvm->arch.xics != NULL);
+	ret = ret || (kvm->arch.xive != NULL);
 #endif
 	smp_rmb();
 	return ret;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1ee22a910074..f7cf2cd564ef 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,6 +38,8 @@
 #include <asm/irqflags.h>
 #include <asm/iommu.h>
 #include <asm/switch_to.h>
+#include <asm/xive.h>
+
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@@ -697,7 +699,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
 		break;
 	case KVMPPC_IRQ_XICS:
-		kvmppc_xics_free_icp(vcpu);
+		if (xive_enabled())
+			kvmppc_xive_cleanup_vcpu(vcpu);
+		else
+			kvmppc_xics_free_icp(vcpu);
 		break;
 	}
 
@@ -1522,8 +1527,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
 		r = -EPERM;
 		dev = kvm_device_from_filp(f.file);
-		if (dev)
-			r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+		if (dev) {
+			if (xive_enabled())
+				r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
+			else
+				r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+		}
 
 		fdput(f);
 		break;
@@ -1547,7 +1556,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
 		return true;
 #endif
 #ifdef CONFIG_KVM_XICS
-	if (kvm->arch.xics)
+	if (kvm->arch.xics || kvm->arch.xive)
 		return true;
 #endif
 	return false;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 7925a9d72cca..59684b4af4d1 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -967,3 +967,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind);
 EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
 /* Export this for KVM */
 EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
+EXPORT_SYMBOL_GPL(opal_int_eoi);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 6a98efb14264..913825086b8d 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -46,13 +46,15 @@
 #endif
 
 bool __xive_enabled;
+EXPORT_SYMBOL_GPL(__xive_enabled);
 bool xive_cmdline_disabled;
 
 /* We use only one priority for now */
 static u8 xive_irq_priority;
 
-/* TIMA */
+/* TIMA exported to KVM */
 void __iomem *xive_tima;
+EXPORT_SYMBOL_GPL(xive_tima);
 u32 xive_tima_offset;
 
 /* Backend ops */
@@ -345,8 +347,11 @@ static void xive_irq_eoi(struct irq_data *d)
 	DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
 		    d->irq, irqd_to_hwirq(d), xc->pending_prio);
 
-	/* EOI the source if it hasn't been disabled */
-	if (!irqd_irq_disabled(d))
+	/*
+	 * EOI the source if it hasn't been disabled and hasn't
+	 * been passed-through to a KVM guest
+	 */
+	if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d))
 		xive_do_source_eoi(irqd_to_hwirq(d), xd);
 
 	/*
@@ -689,9 +694,14 @@ static int xive_irq_set_affinity(struct irq_data *d,
 
 	old_target = xd->target;
 
-	rc = xive_ops->configure_irq(hw_irq,
-				     get_hard_smp_processor_id(target),
-				     xive_irq_priority, d->irq);
+	/*
+	 * Only configure the irq if it's not currently passed-through to
+	 * a KVM guest
+	 */
+	if (!irqd_is_forwarded_to_vcpu(d))
+		rc = xive_ops->configure_irq(hw_irq,
+					     get_hard_smp_processor_id(target),
+					     xive_irq_priority, d->irq);
 	if (rc < 0) {
 		pr_err("Error %d reconfiguring irq %d\n", rc, d->irq);
 		return rc;
@@ -771,6 +781,123 @@ static int xive_irq_retrigger(struct irq_data *d)
 	return 1;
 }
 
+static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	int rc;
+	u8 pq;
+
+	/*
+	 * We only support this on interrupts that do not require
+	 * firmware calls for masking and unmasking
+	 */
+	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW)
+		return -EIO;
+
+	/*
+	 * This is called by KVM with state non-NULL for enabling
+	 * pass-through or NULL for disabling it
+	 */
+	if (state) {
+		irqd_set_forwarded_to_vcpu(d);
+
+		/* Set it to PQ=10 state to prevent further sends */
+		pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+
+		/* No target ? nothing to do */
+		if (xd->target == XIVE_INVALID_TARGET) {
+			/*
+			 * An untargetted interrupt should have been
+			 * also masked at the source
+			 */
+			WARN_ON(pq & 2);
+
+			return 0;
+		}
+
+		/*
+		 * If P was set, adjust state to PQ=11 to indicate
+		 * that a resend is needed for the interrupt to reach
+		 * the guest. Also remember the value of P.
+		 *
+		 * This also tells us that it's in flight to a host queue
+		 * or has already been fetched but hasn't been EOIed yet
+		 * by the host. This it's potentially using up a host
+		 * queue slot. This is important to know because as long
+		 * as this is the case, we must not hard-unmask it when
+		 * "returning" that interrupt to the host.
+		 *
+		 * This saved_p is cleared by the host EOI, when we know
+		 * for sure the queue slot is no longer in use.
+		 */
+		if (pq & 2) {
+			pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+			xd->saved_p = true;
+
+			/*
+			 * Sync the XIVE source HW to ensure the interrupt
+			 * has gone through the EAS before we change its
+			 * target to the guest. That should guarantee us
+			 * that we *will* eventually get an EOI for it on
+			 * the host. Otherwise there would be a small window
+			 * for P to be seen here but the interrupt going
+			 * to the guest queue.
+			 */
+			if (xive_ops->sync_source)
+				xive_ops->sync_source(hw_irq);
+		} else
+			xd->saved_p = false;
+	} else {
+		irqd_clr_forwarded_to_vcpu(d);
+
+		/* No host target ? hard mask and return */
+		if (xd->target == XIVE_INVALID_TARGET) {
+			xive_do_source_set_mask(xd, true);
+			return 0;
+		}
+
+		/*
+		 * Sync the XIVE source HW to ensure the interrupt
+		 * has gone through the EAS before we change its
+		 * target to the host.
+		 */
+		if (xive_ops->sync_source)
+			xive_ops->sync_source(hw_irq);
+
+		/*
+		 * By convention we are called with the interrupt in
+		 * a PQ=10 or PQ=11 state, ie, it won't fire and will
+		 * have latched in Q whether there's a pending HW
+		 * interrupt or not.
+		 *
+		 * First reconfigure the target.
+		 */
+		rc = xive_ops->configure_irq(hw_irq,
+					     get_hard_smp_processor_id(xd->target),
+					     xive_irq_priority, d->irq);
+		if (rc)
+			return rc;
+
+		/*
+		 * Then if saved_p is not set, effectively re-enable the
+		 * interrupt with an EOI. If it is set, we know there is
+		 * still a message in a host queue somewhere that will be
+		 * EOId eventually.
+		 *
+		 * Note: We don't check irqd_irq_disabled(). Effectively,
+		 * we *will* let the irq get through even if masked if the
+		 * HW is still firing it in order to deal with the whole
+		 * saved_p business properly. If the interrupt triggers
+		 * while masked, the generic code will re-mask it anyway.
+		 */
+		if (!xd->saved_p)
+			xive_do_source_eoi(hw_irq, xd);
+
+	}
+	return 0;
+}
+
 static struct irq_chip xive_irq_chip = {
 	.name = "XIVE-IRQ",
 	.irq_startup = xive_irq_startup,
@@ -781,12 +908,14 @@ static struct irq_chip xive_irq_chip = {
 	.irq_set_affinity = xive_irq_set_affinity,
 	.irq_set_type = xive_irq_set_type,
 	.irq_retrigger = xive_irq_retrigger,
+	.irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
 };
 
 bool is_xive_irq(struct irq_chip *chip)
 {
 	return chip == &xive_irq_chip;
 }
+EXPORT_SYMBOL_GPL(is_xive_irq);
 
 void xive_cleanup_irq_data(struct xive_irq_data *xd)
 {
@@ -801,6 +930,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd)
 		xd->trig_mmio = NULL;
 	}
 }
+EXPORT_SYMBOL_GPL(xive_cleanup_irq_data);
 
 static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
 {
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 1a726229a427..ab9ecce61ee5 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -31,6 +31,7 @@
 #include <asm/xive.h>
 #include <asm/xive-regs.h>
 #include <asm/opal.h>
+#include <asm/kvm_ppc.h>
 
 #include "xive-internal.h"
 
@@ -95,6 +96,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(xive_native_populate_irq_data);
 
 int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
 {
@@ -108,6 +110,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
 	}
 	return rc == 0 ? 0 : -ENXIO;
 }
+EXPORT_SYMBOL_GPL(xive_native_configure_irq);
+
 
 /* This can be called multiple time to change a queue configuration */
 int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
@@ -172,6 +176,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
 fail:
 	return rc;
 }
+EXPORT_SYMBOL_GPL(xive_native_configure_queue);
 
 static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
 {
@@ -192,6 +197,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
 {
 	__xive_native_disable_queue(vp_id, q, prio);
 }
+EXPORT_SYMBOL_GPL(xive_native_disable_queue);
 
 static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
 {
@@ -262,6 +268,7 @@ static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
 	}
 	return 0;
 }
+#endif /* CONFIG_SMP */
 
 u32 xive_native_alloc_irq(void)
 {
@@ -277,6 +284,7 @@ u32 xive_native_alloc_irq(void)
 		return 0;
 	return rc;
 }
+EXPORT_SYMBOL_GPL(xive_native_alloc_irq);
 
 void xive_native_free_irq(u32 irq)
 {
@@ -287,7 +295,9 @@ void xive_native_free_irq(u32 irq)
 		msleep(1);
 	}
 }
+EXPORT_SYMBOL_GPL(xive_native_free_irq);
 
+#ifdef CONFIG_SMP
 static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
 {
 	s64 rc;
@@ -383,7 +393,7 @@ static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
 		return;
 
 	/* Enable the pool VP */
-	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	vp = xive_pool_vps + cpu;
 	pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
 	for (;;) {
 		rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
@@ -428,7 +438,7 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
 	in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
 
 	/* Disable it */
-	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	vp = xive_pool_vps + cpu;
 	for (;;) {
 		rc = opal_xive_set_vp_info(vp, 0, 0);
 		if (rc != OPAL_BUSY)
@@ -437,10 +447,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
 	}
 }
 
-static void xive_native_sync_source(u32 hw_irq)
+void xive_native_sync_source(u32 hw_irq)
 {
 	opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
 }
+EXPORT_SYMBOL_GPL(xive_native_sync_source);
 
 static const struct xive_ops xive_native_ops = {
 	.populate_irq_data	= xive_native_populate_irq_data,
@@ -501,10 +512,24 @@ static bool xive_parse_provisioning(struct device_node *np)
 	return true;
 }
 
+static void xive_native_setup_pools(void)
+{
+	/* Allocate a pool big enough */
+	pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids);
+
+	xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids);
+	if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP))
+		pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n");
+
+	pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n",
+		 xive_pool_vps, nr_cpu_ids);
+}
+
 u32 xive_native_default_eq_shift(void)
 {
 	return xive_queue_shift;
 }
+EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
 
 bool xive_native_init(void)
 {
@@ -514,7 +539,7 @@ bool xive_native_init(void)
 	struct property *prop;
 	u8 max_prio = 7;
 	const __be32 *p;
-	u32 val;
+	u32 val, cpu;
 	s64 rc;
 
 	if (xive_cmdline_disabled)
@@ -550,7 +575,11 @@ bool xive_native_init(void)
 			break;
 	}
 
-	/* Grab size of provisioning pages */
+	/* Configure Thread Management areas for KVM */
+	for_each_possible_cpu(cpu)
+		kvmppc_set_xive_tima(cpu, r.start, tima);
+
+	/* Grab size of provisionning pages */
 	xive_parse_provisioning(np);
 
 	/* Switch the XIVE to exploitation mode */
@@ -560,6 +589,9 @@ bool xive_native_init(void)
 		return false;
 	}
 
+	/* Setup some dummy HV pool VPs */
+	xive_native_setup_pools();
+
 	/* Initialize XIVE core with our backend */
 	if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
 			    max_prio)) {
@@ -638,3 +670,47 @@ void xive_native_free_vp_block(u32 vp_base)
 		pr_warn("OPAL error %lld freeing VP block\n", rc);
 }
 EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
+
+int xive_native_enable_vp(u32 vp_id)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	return rc ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(xive_native_enable_vp);
+
+int xive_native_disable_vp(u32 vp_id)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp_id, 0, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	return rc ? -EIO : 0;
+}
+EXPORT_SYMBOL_GPL(xive_native_disable_vp);
+
+int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
+{
+	__be64 vp_cam_be;
+	__be32 vp_chip_id_be;
+	s64 rc;
+
+	rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be);
+	if (rc)
+		return -EIO;
+	*out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu;
+	*out_chip_id = be32_to_cpu(vp_chip_id_be);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f5bddf92faba..9c761fea0c98 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1020,6 +1020,8 @@ struct kvm_x86_ops {
 	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
+	int (*write_log_dirty)(struct kvm_vcpu *vcpu);
+
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 558676538fca..5d3376f67794 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1498,6 +1498,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
+/**
+ * kvm_arch_write_log_dirty - emulate dirty page logging
+ * @vcpu: Guest mode vcpu
+ *
+ * Emulate arch specific page modification logging for the
+ * nested hypervisor
+ */
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+{
+	if (kvm_x86_ops->write_log_dirty)
+		return kvm_x86_ops->write_log_dirty(vcpu);
+
+	return 0;
+}
+
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d8ccb32f7308..27975807cc64 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -202,4 +202,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 314d2071b337..56241746abbd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -226,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 		if (level == walker->level && write_fault &&
 				!(pte & PT_GUEST_DIRTY_MASK)) {
 			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+			if (kvm_arch_write_log_dirty(vcpu))
+				return -EINVAL;
+#endif
 			pte |= PT_GUEST_DIRTY_MASK;
 		}
 		if (pte == orig_pte)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5fd459c4043..c6f4ad44aa95 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -248,6 +248,7 @@ struct __packed vmcs12 {
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
+	u64 pml_address;
 	u64 guest_ia32_debugctl;
 	u64 guest_ia32_pat;
 	u64 guest_ia32_efer;
@@ -369,6 +370,7 @@ struct __packed vmcs12 {
 	u16 guest_ldtr_selector;
 	u16 guest_tr_selector;
 	u16 guest_intr_status;
+	u16 guest_pml_index;
 	u16 host_es_selector;
 	u16 host_cs_selector;
 	u16 host_ss_selector;
@@ -407,6 +409,7 @@ struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
 	gpa_t vmxon_ptr;
+	bool pml_full;
 
 	/* The guest-physical address of the current VMCS L1 keeps for L2 */
 	gpa_t current_vmptr;
@@ -742,6 +745,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
 	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
 	FIELD(GUEST_INTR_STATUS, guest_intr_status),
+	FIELD(GUEST_PML_INDEX, guest_pml_index),
 	FIELD(HOST_ES_SELECTOR, host_es_selector),
 	FIELD(HOST_CS_SELECTOR, host_cs_selector),
 	FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -767,6 +771,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+	FIELD64(PML_ADDRESS, pml_address),
 	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
 	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
 	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@ -1314,6 +1319,11 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+	return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low);
+}
+
 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
 {
 	return vmcs12->cpu_based_vm_exec_control & bit;
@@ -1348,6 +1358,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
 		vmx_xsaves_supported();
 }
 
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
 static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -2751,8 +2766,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
 			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
 			VMX_EPT_1GB_PAGE_BIT;
-	       if (enable_ept_ad_bits)
+		if (enable_ept_ad_bits) {
+			vmx->nested.nested_vmx_secondary_ctls_high |=
+				SECONDARY_EXEC_ENABLE_PML;
 		       vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
+		}
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
@@ -8114,7 +8132,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_PREEMPTION_TIMER:
 		return false;
 	case EXIT_REASON_PML_FULL:
-		/* We don't expose PML support to L1. */
+		/* We emulate PML support to L1. */
 		return false;
 	default:
 		return true;
@@ -9364,13 +9382,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 		struct x86_exception *fault)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason;
+	unsigned long exit_qualification = vcpu->arch.exit_qualification;
 
-	if (fault->error_code & PFERR_RSVD_MASK)
+	if (vmx->nested.pml_full) {
+		exit_reason = EXIT_REASON_PML_FULL;
+		vmx->nested.pml_full = false;
+		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+	} else if (fault->error_code & PFERR_RSVD_MASK)
 		exit_reason = EXIT_REASON_EPT_MISCONFIG;
 	else
 		exit_reason = EXIT_REASON_EPT_VIOLATION;
-	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
+
+	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
 	vmcs12->guest_physical_address = fault->address;
 }
 
@@ -9713,6 +9738,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+					 struct vmcs12 *vmcs12)
+{
+	u64 address = vmcs12->pml_address;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+		if (!nested_cpu_has_ept(vmcs12) ||
+		    !IS_ALIGNED(address, 4096)  ||
+		    address >> maxphyaddr)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 				       struct vmx_msr_entry *e)
 {
@@ -9886,7 +9927,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			  bool from_vmentry, u32 *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exec_control;
+	u32 exec_control, vmcs12_exec_ctrl;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -10017,8 +10058,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
 		if (nested_cpu_has(vmcs12,
-				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
-			exec_control |= vmcs12->secondary_vm_exec_control;
+				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+				~SECONDARY_EXEC_ENABLE_PML;
+			exec_control |= vmcs12_exec_ctrl;
+		}
 
 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 			vmcs_write64(EOI_EXIT_BITMAP0,
@@ -10248,6 +10292,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 				vmx->nested.nested_vmx_procbased_ctls_low,
 				vmx->nested.nested_vmx_procbased_ctls_high) ||
@@ -10266,6 +10313,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmx->nested.nested_vmx_entry_ctls_high))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
 	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
 	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
@@ -11143,6 +11193,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
 	kvm_flush_pml_buffers(kvm);
 }
 
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gpa_t gpa;
+	struct page *page = NULL;
+	u64 *pml_address;
+
+	if (is_guest_mode(vcpu)) {
+		WARN_ON_ONCE(vmx->nested.pml_full);
+
+		/*
+		 * Check if PML is enabled for the nested guest.
+		 * Whether eptp bit 6 is set is already checked
+		 * as part of A/D emulation.
+		 */
+		vmcs12 = get_vmcs12(vcpu);
+		if (!nested_cpu_has_pml(vmcs12))
+			return 0;
+
+		if (vmcs12->guest_pml_index > PML_ENTITY_NUM) {
+			vmx->nested.pml_full = true;
+			return 1;
+		}
+
+		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+
+		page = nested_get_page(vcpu, vmcs12->pml_address);
+		if (!page)
+			return 0;
+
+		pml_address = kmap(page);
+		pml_address[vmcs12->guest_pml_index--] = gpa;
+		kunmap(page);
+		nested_release_page_clean(page);
+	}
+
+	return 0;
+}
+
 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 					   struct kvm_memory_slot *memslot,
 					   gfn_t offset, unsigned long mask)
@@ -11502,6 +11592,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+	.write_log_dirty = vmx_write_pml_buffer,
 
 	.pre_block = vmx_pre_block,
 	.post_block = vmx_post_block,