From 24647e0a39b6c5ef007fa2f9a91ac8b581e093b9 Mon Sep 17 00:00:00 2001
From: Peter Shier <pshier@google.com>
Date: Wed, 10 Oct 2018 15:56:53 -0700
Subject: KVM: x86: Return updated timer current count register from
 KVM_GET_LAPIC

kvm_vcpu_ioctl_get_lapic (implements KVM_GET_LAPIC ioctl) does a bulk copy
of the LAPIC registers but must take into account that the one-shot and
periodic timer current count register is computed upon reads and is not
present in register state. When restoring LAPIC state (e.g. after
migration), restart timers from their their current count values at time of
save.

Note: When a one-shot timer expires, the code in arch/x86/kvm/lapic.c does
not zero the value of the LAPIC initial count register (emulating HW
behavior). If no other timer is run and pending prior to a subsequent
KVM_GET_LAPIC call, the returned register set will include the expired
one-shot initial count. On a subsequent KVM_SET_LAPIC call the code will
see a non-zero initial count and start a new one-shot timer using the
expired timer's count. This is a prior existing bug and will be addressed
in a separate patch. Thanks to jmattson@google.com for this find.

Signed-off-by: Peter Shier <pshier@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <20181010225653.238911-1-pshier@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 61 +++++++++++++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/lapic.h |  7 +++++-
 2 files changed, 57 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9af25c97612a..38f7dc9c16ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1648,13 +1648,18 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+	return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+}
+
 static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
 {
 	ktime_t now, remaining;
 	u64 ns_remaining_old, ns_remaining_new;
 
-	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
-		* APIC_BUS_CYCLE_NS * apic->divide_count;
+	apic->lapic_timer.period =
+			tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
 	limit_periodic_timer_frequency(apic);
 
 	now = ktime_get();
@@ -1672,14 +1677,15 @@ static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_diviso
 	apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
 }
 
-static bool set_target_expiration(struct kvm_lapic *apic)
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
 {
 	ktime_t now;
 	u64 tscl = rdtsc();
+	s64 deadline;
 
 	now = ktime_get();
-	apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
-		* APIC_BUS_CYCLE_NS * apic->divide_count;
+	apic->lapic_timer.period =
+			tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
 
 	if (!apic->lapic_timer.period) {
 		apic->lapic_timer.tscdeadline = 0;
@@ -1687,10 +1693,32 @@ static bool set_target_expiration(struct kvm_lapic *apic)
 	}
 
 	limit_periodic_timer_frequency(apic);
+	deadline = apic->lapic_timer.period;
+
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+		if (unlikely(count_reg != APIC_TMICT)) {
+			deadline = tmict_to_ns(apic,
+				     kvm_lapic_get_reg(apic, count_reg));
+			if (unlikely(deadline <= 0))
+				deadline = apic->lapic_timer.period;
+			else if (unlikely(deadline > apic->lapic_timer.period)) {
+				pr_info_ratelimited(
+				    "kvm: vcpu %i: requested lapic timer restore with "
+				    "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+				    "Using initial count to start timer.\n",
+				    apic->vcpu->vcpu_id,
+				    count_reg,
+				    kvm_lapic_get_reg(apic, count_reg),
+				    deadline, apic->lapic_timer.period);
+				kvm_lapic_set_reg(apic, count_reg, 0);
+				deadline = apic->lapic_timer.period;
+			}
+		}
+	}
 
 	apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
-		nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
-	apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+		nsec_to_cycles(apic->vcpu, deadline);
+	apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
 
 	return true;
 }
@@ -1872,17 +1900,22 @@ void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
 	restart_apic_timer(apic);
 }
 
-static void start_apic_timer(struct kvm_lapic *apic)
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
 {
 	atomic_set(&apic->lapic_timer.pending, 0);
 
 	if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
-	    && !set_target_expiration(apic))
+	    && !set_target_expiration(apic, count_reg))
 		return;
 
 	restart_apic_timer(apic);
 }
 
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+	__start_apic_timer(apic, APIC_TMICT);
+}
+
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 {
 	bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
@@ -2493,6 +2526,14 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
 int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 {
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+	/*
+	 * Get calculated timer current count for remaining timer period (if
+	 * any) and store it in the returned register set.
+	 */
+	__kvm_lapic_set_reg(s->regs, APIC_TMCCT,
+			    __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
 	return kvm_apic_state_fixup(vcpu, s, false);
 }
 
@@ -2520,7 +2561,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	apic_update_lvtt(apic);
 	apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
 	update_divide_count(apic);
-	start_apic_timer(apic);
+	__start_apic_timer(apic, APIC_TMCCT);
 	kvm_apic_update_apicv(vcpu);
 	apic->highest_isr_cache = -1;
 	if (vcpu->arch.apicv_active) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a0ffb4331418..7f15f9e69efe 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -161,9 +161,14 @@ static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
 	return *((u32 *) (apic->regs + reg_off));
 }
 
+static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
+{
+	*((u32 *) (regs + reg_off)) = val;
+}
+
 static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
 {
-	*((u32 *) (apic->regs + reg_off)) = val;
+	__kvm_lapic_set_reg(apic->regs, reg_off, val);
 }
 
 extern struct static_key kvm_no_apic_vcpu;
-- 
cgit v1.2.3


From 43d05de2bee7561b7958628b7c27c89b779de990 Mon Sep 17 00:00:00 2001
From: Eric Northup <digitaleric@gmail.com>
Date: Tue, 14 Apr 2020 18:23:20 -0700
Subject: KVM: pass through CPUID(0x80000006)

Return the host's L2 cache and TLB information for CPUID.0x80000006
instead of zeroing out the entry as part of KVM_GET_SUPPORTED_CPUID.
This allows a userspace VMM to feed KVM_GET_SUPPORTED_CPUID's output
directly into KVM_SET_CPUID2 (without breaking the guest).

Signed-off-by: Eric Northup (Google) <digitaleric@gmail.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Jon Cargille <jcargill@google.com>
Message-Id: <20200415012320.236065-1-jcargill@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/cpuid.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 901cd1fdecd9..6828be99b908 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -728,6 +728,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		cpuid_entry_override(entry, CPUID_8000_0001_EDX);
 		cpuid_entry_override(entry, CPUID_8000_0001_ECX);
 		break;
+	case 0x80000006:
+		/* L2 cache and TLB: pass through host info. */
+		break;
 	case 0x80000007: /* Advanced power management */
 		/* invariant TSC is CPUID.80000007H:EDX[8] */
 		entry->edx &= (1 << 8);
-- 
cgit v1.2.3


From e8eff282154fc392dadf6a779009c7ecaa7e169b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:27:57 -0700
Subject: KVM: VMX: Flush all EPTP/VPID contexts on remote TLB flush

Flush all EPTP/VPID contexts if a TLB flush _may_ have been triggered by
a remote or deferred TLB flush, i.e. by KVM_REQ_TLB_FLUSH.  Remote TLB
flushes require all contexts to be invalidated, not just the active
contexts, e.g. all mappings in all contexts for a given HVA need to be
invalidated on a mmu_notifier invalidation.  Similarly, the instigator
of the deferred TLB flush may be expecting all contexts to be flushed,
e.g. vmx_vcpu_load_vmcs().

Without nested VMX, flushing only the current EPTP/VPID context isn't
problematic because KVM uses a constant VPID for each vCPU, and
mmu_alloc_direct_roots() all but guarantees KVM will use a single EPTP
for L1.  In the rare case where a different EPTP is created or reused,
KVM (currently) unconditionally flushes the new EPTP context prior to
entering the guest.

With nested VMX, KVM conditionally uses a different VPID for L2, and
unconditionally uses a different EPTP for L2.  Because KVM doesn't
_intentionally_ guarantee L2's EPTP/VPID context is flushed on nested
VM-Enter, it'd be possible for a malicious L1 to attack the host and/or
different VMs by exploiting the lack of flushing for L2.

  1) Launch nested guest from malicious L1.

  2) Nested VM-Enter to L2.

  3) Access target GPA 'g'.  CPU inserts TLB entry tagged with L2's ASID
     mapping 'g' to host PFN 'x'.

  2) Nested VM-Exit to L1.

  3) L1 triggers kernel same-page merging (ksm) by duplicating/zeroing
     the page for PFN 'x'.

  4) Host kernel merges PFN 'x' with PFN 'y', i.e. unmaps PFN 'x' and
     remaps the page to PFN 'y'.  mmu_notifier sends invalidate command,
     KVM flushes TLB only for L1's ASID.

  4) Host kernel reallocates PFN 'x' to some other task/guest.

  5) Nested VM-Enter to L2.  KVM does not invalidate L2's EPTP or VPID.

  6) L2 accesses GPA 'g' and gains read/write access to PFN 'x' via its
     stale TLB entry.

However, current KVM unconditionally flushes L1's EPTP/VPID context on
nested VM-Exit.  But, that behavior is mostly unintentional, KVM doesn't
go out of its way to flush EPTP/VPID on nested VM-Enter/VM-Exit, rather
a TLB flush is guaranteed to occur prior to re-entering L1 due to
__kvm_mmu_new_cr3() always being called with skip_tlb_flush=false.  On
nested VM-Enter, this happens via kvm_init_shadow_ept_mmu() (nested EPT
enabled) or in nested_vmx_load_cr3() (nested EPT disabled).  On nested
VM-Exit it occurs via nested_vmx_load_cr3().

This also fixes a bug where a deferred TLB flush in the context of L2,
with EPT disabled, would flush L1's VPID instead of L2's VPID, as
vmx_flush_tlb() flushes L1's VPID regardless of is_guest_mode().

Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Ben Gardon <bgardon@google.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Junaid Shahid <junaids@google.com>
Cc: Liran Alon <liran.alon@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: John Haxby <john.haxby@oracle.com>
Reviewed-by: Liran Alon <liran.alon@oracle.com>
Fixes: efebf0aaec3d ("KVM: nVMX: Do not flush TLB on L1<->L2 transitions if L1 uses VPID and EPT")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index aab9df55336e..fa40319f1698 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -515,7 +515,33 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
 
 static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
-	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * Flush all EPTP/VPID contexts if the TLB flush _may_ have been
+	 * invoked via kvm_flush_remote_tlbs(), which always passes %true for
+	 * @invalidate_gpa.  Flushing remote TLBs requires all contexts to be
+	 * flushed, not just the active context.
+	 *
+	 * Note, this also ensures a deferred TLB flush with VPID enabled and
+	 * EPT disabled invalidates the "correct" VPID, by nuking both L1 and
+	 * L2's VPIDs.
+	 */
+	if (invalidate_gpa) {
+		if (enable_ept) {
+			ept_sync_global();
+		} else if (enable_vpid) {
+			if (cpu_has_vmx_invvpid_global()) {
+				vpid_sync_vcpu_global();
+			} else {
+				WARN_ON_ONCE(!cpu_has_vmx_invvpid_single());
+				vpid_sync_vcpu_single(vmx->vpid);
+				vpid_sync_vcpu_single(vmx->nested.vpid02);
+			}
+		}
+	} else {
+		__vmx_flush_tlb(vcpu, vmx->vpid, false);
+	}
 }
 
 static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
-- 
cgit v1.2.3


From eed0030e4caa941cfbdfca00981395d85e6b3c3e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:27:58 -0700
Subject: KVM: nVMX: Validate the EPTP when emulating INVEPT(EXTENT_CONTEXT)

Signal VM-Fail for the single-context variant of INVEPT if the specified
EPTP is invalid.  Per the INEVPT pseudocode in Intel's SDM, it's subject
to the standard EPT checks:

  If VM entry with the "enable EPT" VM execution control set to 1 would
  fail due to the EPTP value then VMfail(Invalid operand to INVEPT/INVVPID);

Fixes: bfd0a56b90005 ("nEPT: Nested INVEPT")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-3-sean.j.christopherson@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index fd78ffbde644..dc7d8104b58e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5157,8 +5157,12 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	}
 
 	switch (type) {
-	case VMX_EPT_EXTENT_GLOBAL:
 	case VMX_EPT_EXTENT_CONTEXT:
+		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+			return nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		fallthrough;
+	case VMX_EPT_EXTENT_GLOBAL:
 	/*
 	 * TODO: Sync the necessary shadow EPT roots here, rather than
 	 * at the next emulated VM-entry.
-- 
cgit v1.2.3


From f8aa7e3958bc433087ae7b9d7f24a92036c41141 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:27:59 -0700
Subject: KVM: nVMX: Invalidate all EPTP contexts when emulating INVEPT for L1

Free all L2 (guest_mmu) roots when emulating INVEPT for L1.  Outstanding
changes to the EPT tables managed by L1 need to be recognized, and
relying on KVM to always flush L2's EPTP context on nested VM-Enter is
dangerous.

Similar to handle_invpcid(), rely on kvm_mmu_free_roots() to do a remote
TLB flush if necessary, e.g. if L1 has never entered L2 then there is
nothing to be done.

Nuking all L2 roots is overkill for the single-context variant, but it's
the safe and easy bet.  A more precise zap mechanism will be added in
the future.  Add a TODO to call out that KVM only needs to invalidate
affected contexts.

Fixes: 14c07ad89f4d ("x86/kvm/mmu: introduce guest_mmu")
Reported-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-4-sean.j.christopherson@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dc7d8104b58e..7ecb555e385a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5161,12 +5161,16 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
 			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+		/* TODO: sync only the target EPTP context. */
 		fallthrough;
 	case VMX_EPT_EXTENT_GLOBAL:
-	/*
-	 * TODO: Sync the necessary shadow EPT roots here, rather than
-	 * at the next emulated VM-entry.
-	 */
+		/*
+		 * Nested EPT roots are always held through guest_mmu,
+		 * not root_mmu.
+		 */
+		kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu,
+				   KVM_MMU_ROOTS_ALL);
 		break;
 	default:
 		BUG_ON(1);
-- 
cgit v1.2.3


From d6e3f8385d27e6357221e01158fdd369f762cd0f Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Fri, 20 Mar 2020 14:28:00 -0700
Subject: KVM: nVMX: Invalidate all roots when emulating INVVPID without EPT

Free all roots when emulating INVVPID for L1 and EPT is disabled, as
outstanding changes to the page tables managed by L1 need to be
recognized.  Because L1 and L2 share an MMU when EPT is disabled, and
because VPID is not tracked by the MMU role, all roots in the current
MMU (root_mmu) need to be freed, otherwise a future nested VM-Enter or
VM-Exit could do a fast CR3 switch (without a flush/sync) and consume
stale SPTEs.

Fixes: 5c614b3583e7b ("KVM: nVMX: nested VPID emulation")
Signed-off-by: Junaid Shahid <junaids@google.com>
[sean: ported to upstream KVM, reworded the comment and changelog]
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-5-sean.j.christopherson@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7ecb555e385a..8f3ff07c7e66 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5255,6 +5255,20 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 
+	/*
+	 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+	 * linear mappings for L2 (tagged with L2's VPID).  Free all roots as
+	 * VPIDs are not tracked in the MMU role.
+	 *
+	 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+	 * an MMU when EPT is disabled.
+	 *
+	 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
+	 */
+	if (!enable_ept)
+		kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
+				   KVM_MMU_ROOTS_ALL);
+
 	return nested_vmx_succeed(vcpu);
 }
 
-- 
cgit v1.2.3


From 53b3d8e9d57753295b33065f80b1e2fb4fcb946d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:01 -0700
Subject: KVM: x86: Export kvm_propagate_fault() (as
 kvm_inject_emulated_page_fault)

Export the page fault propagation helper so that VMX can use it to
correctly emulate TLB invalidation on page faults in an upcoming patch.

In the (hopefully) not-too-distant future, SGX virtualization will also
want access to the helper for injecting page faults to the correct level
(L1 vs. L2) when emulating ENCLS instructions.

Rename the function to kvm_inject_emulated_page_fault() to clarify that
it is (a) injecting a fault and (b) only for page faults.  WARN if it's
invoked with an exception other than PF_VECTOR.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-6-sean.j.christopherson@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/x86.c              | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..34ad7297b06b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1452,6 +1452,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+				    struct x86_exception *fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gfn_t gfn, void *data, int offset, int len,
 			    u32 access);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 027dfd278a97..003e625367b7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -611,8 +611,11 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+				    struct x86_exception *fault)
 {
+	WARN_ON_ONCE(fault->vector != PF_VECTOR);
+
 	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 	else
@@ -620,6 +623,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau
 
 	return fault->nested_page_fault;
 }
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
@@ -6381,7 +6385,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 {
 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	if (ctxt->exception.vector == PF_VECTOR)
-		return kvm_propagate_fault(vcpu, &ctxt->exception);
+		return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
 
 	if (ctxt->exception.error_code_valid)
 		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
-- 
cgit v1.2.3


From d075fc3154be9de3b00b7bba2fb6009fe8ab611a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 3 Apr 2020 17:30:49 +0200
Subject: KVM: s390: vsie: Move conditional reschedule

Let's move it to the outer loop, in case we ever run again into long
loops, trying to map the prefix. While at it, convert it to cond_resched().

Signed-off-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20200403153050.20569-5-david@redhat.com
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4f6c22d72072..ef05b4e167fb 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1000,8 +1000,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 
 	handle_last_fault(vcpu, vsie_page);
 
-	if (need_resched())
-		schedule();
 	if (test_cpu_flag(CIF_MCCK_PENDING))
 		s390_handle_mcck();
 
@@ -1185,6 +1183,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		    kvm_s390_vcpu_has_irq(vcpu, 0) ||
 		    kvm_s390_vcpu_sie_inhibited(vcpu))
 			break;
+		cond_resched();
 	}
 
 	if (rc == -EFAULT) {
-- 
cgit v1.2.3


From 62cf666e4eb8c00dee69bed505db60bf8c3fecf0 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 3 Apr 2020 17:30:50 +0200
Subject: KVM: s390: vsie: gmap_table_walk() simplifications

Let's use asce_type where applicable. Also, simplify our sanity check for
valid table levels and convert it into a WARN_ON_ONCE(). Check if we even
have a valid gmap shadow as the very first step.

Signed-off-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20200403153050.20569-6-david@redhat.com
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 1a95d8809cc3..4b6903fbba4a 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -788,19 +788,19 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
 					     unsigned long gaddr, int level)
 {
 	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
-	unsigned long *table;
+	unsigned long *table = gmap->table;
 
-	if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
-		return NULL;
 	if (gmap_is_shadow(gmap) && gmap->removed)
 		return NULL;
 
+	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
+		return NULL;
+
 	if (asce_type != _ASCE_TYPE_REGION1 &&
 	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
 		return NULL;
 
-	table = gmap->table;
-	switch (gmap->asce & _ASCE_TYPE_MASK) {
+	switch (asce_type) {
 	case _ASCE_TYPE_REGION1:
 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 		if (level == 4)
-- 
cgit v1.2.3


From 0b545fd17f84184f9536bde68c3521e36c488448 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Sat, 18 Apr 2020 16:19:26 +0800
Subject: KVM: s390: remove unneeded semicolon in gisa_vcpu_kicker()

Fix the following coccicheck warning:

arch/s390/kvm/interrupt.c:3085:2-3: Unneeded semicolon

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20200418081926.41666-1-yanaijie@huawei.com
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index bfb481134994..a4d4ca2769bd 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -3082,7 +3082,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
 		__airqs_kick_single_vcpu(kvm, pending_mask);
 		hrtimer_forward_now(timer, ns_to_ktime(gi->expires));
 		return HRTIMER_RESTART;
-	};
+	}
 
 	return HRTIMER_NORESTART;
 }
-- 
cgit v1.2.3


From 5efac0741ce238e0844d3f7af00198f81e84926a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 23 Mar 2020 20:42:57 -0400
Subject: KVM: x86: introduce kvm_mmu_invalidate_gva

Wrap the combination of mmu->invlpg and kvm_x86_ops->tlb_flush_gva
into a new function.  This function also lets us specify the host PGD to
invalidate and also the MMU, both of which will be useful in fixing and
simplifying kvm_inject_emulated_page_fault.

A nested guest's MMU however has g_context->invlpg == NULL.  Instead of
setting it to nonpaging_invlpg, make kvm_mmu_invalidate_gva the only
entry point to mmu->invlpg and make a NULL invlpg pointer equivalent
to nonpaging_invlpg, saving a retpoline.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu/mmu.c          | 71 ++++++++++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 34ad7297b06b..8c1e094a639f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1511,6 +1511,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gva_t gva, hpa_t root_hpa);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..84eeaa4ea149 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2153,10 +2153,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
-{
-}
-
 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu_page *sp, u64 *spte,
 				 const void *pte)
@@ -4237,7 +4233,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_page = nonpaging_sync_page;
-	context->invlpg = nonpaging_invlpg;
+	context->invlpg = NULL;
 	context->update_pte = nonpaging_update_pte;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
@@ -4928,7 +4924,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->mmu_role.as_u64 = new_role.as_u64;
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
-	context->invlpg = nonpaging_invlpg;
+	context->invlpg = NULL;
 	context->update_pte = nonpaging_update_pte;
 	context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
 	context->direct_map = true;
@@ -5096,6 +5092,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	g_context->get_pdptr         = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 
+	/*
+	 * L2 page tables are never shadowed, so there is no need to sync
+	 * SPTEs.
+	 */
+	g_context->invlpg            = NULL;
+
 	/*
 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
 	 * L1's nested page tables (e.g. EPT12). The nested translation
@@ -5497,37 +5499,54 @@ emulate:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			    gva_t gva, hpa_t root_hpa)
 {
-	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	int i;
 
-	/* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
-	if (is_noncanonical_address(gva, vcpu))
+	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
+	if (mmu != &vcpu->arch.guest_mmu) {
+		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
+		if (is_noncanonical_address(gva, vcpu))
+			return;
+
+		kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+	}
+
+	if (!mmu->invlpg)
 		return;
 
-	mmu->invlpg(vcpu, gva, mmu->root_hpa);
+	if (root_hpa == INVALID_PAGE) {
+		mmu->invlpg(vcpu, gva, mmu->root_hpa);
 
-	/*
-	 * INVLPG is required to invalidate any global mappings for the VA,
-	 * irrespective of PCID. Since it would take us roughly similar amount
-	 * of work to determine whether any of the prev_root mappings of the VA
-	 * is marked global, or to just sync it blindly, so we might as well
-	 * just always sync it.
-	 *
-	 * Mappings not reachable via the current cr3 or the prev_roots will be
-	 * synced when switching to that cr3, so nothing needs to be done here
-	 * for them.
-	 */
-	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-		if (VALID_PAGE(mmu->prev_roots[i].hpa))
-			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+		/*
+		 * INVLPG is required to invalidate any global mappings for the VA,
+		 * irrespective of PCID. Since it would take us roughly similar amount
+		 * of work to determine whether any of the prev_root mappings of the VA
+		 * is marked global, or to just sync it blindly, so we might as well
+		 * just always sync it.
+		 *
+		 * Mappings not reachable via the current cr3 or the prev_roots will be
+		 * synced when switching to that cr3, so nothing needs to be done here
+		 * for them.
+		 */
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			if (VALID_PAGE(mmu->prev_roots[i].hpa))
+				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+	} else {
+		mmu->invlpg(vcpu, gva, root_hpa);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
 
-	kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 
+
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-- 
cgit v1.2.3


From 0cd665bd20f9088d363158b4ac75592af18ecf4f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 25 Mar 2020 12:50:03 -0400
Subject: KVM: x86: cleanup kvm_inject_emulated_page_fault

To reconstruct the kvm_mmu to be used for page fault injection, we
can simply use fault->nested_page_fault.  This matches how
fault->nested_page_fault is assigned in the first place by
FNAME(walk_addr_generic).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c         | 6 ------
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 arch/x86/kvm/x86.c             | 8 ++++----
 3 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 84eeaa4ea149..32c9f4b2a281 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4353,12 +4353,6 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 	return kvm_read_cr3(vcpu);
 }
 
-static void inject_page_fault(struct kvm_vcpu *vcpu,
-			      struct x86_exception *fault)
-{
-	vcpu->arch.mmu->inject_page_fault(vcpu, fault);
-}
-
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			   unsigned int access, int *nr_present)
 {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9bdf9b7d9a96..efec7d27b8c5 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -812,7 +812,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
 		if (!prefault)
-			inject_page_fault(vcpu, &walker.fault);
+			kvm_inject_emulated_page_fault(vcpu, &walker.fault);
 
 		return RET_PF_RETRY;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 003e625367b7..2ab821f6281f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -614,12 +614,12 @@ EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault)
 {
+	struct kvm_mmu *fault_mmu;
 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
 
-	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
-		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
-	else
-		vcpu->arch.mmu->inject_page_fault(vcpu, fault);
+	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+					       vcpu->arch.walk_mmu;
+	fault_mmu->inject_page_fault(vcpu, fault);
 
 	return fault->nested_page_fault;
 }
-- 
cgit v1.2.3


From ee1fa209f5e5ca5c1e76c7aa1c2aab292f371f4a Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Fri, 20 Mar 2020 14:28:03 -0700
Subject: KVM: x86: Sync SPTEs when injecting page/EPT fault into L1

When injecting a page fault or EPT violation/misconfiguration, KVM is
not syncing any shadow PTEs associated with the faulting address,
including those in previous MMUs that are associated with L1's current
EPTP (in a nested EPT scenario), nor is it flushing any hardware TLB
entries.  All this is done by kvm_mmu_invalidate_gva.

Page faults that are either !PRESENT or RSVD are exempt from the flushing,
as the CPU is not allowed to cache such translations.

Signed-off-by: Junaid Shahid <junaids@google.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-8-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 12 ++++++------
 arch/x86/kvm/vmx/vmx.c    |  2 +-
 arch/x86/kvm/x86.c        | 11 ++++++++++-
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8f3ff07c7e66..d0857bfcb568 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4560,7 +4560,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 		return 1;
 
 	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
+		kvm_inject_emulated_page_fault(vcpu, &e);
 		return 1;
 	}
 
@@ -4869,7 +4869,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 			return 1;
 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
 		if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
-			kvm_inject_page_fault(vcpu, &e);
+			kvm_inject_emulated_page_fault(vcpu, &e);
 			return 1;
 		}
 	}
@@ -4943,7 +4943,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 					instr_info, false, len, &gva))
 			return 1;
 		if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
-			kvm_inject_page_fault(vcpu, &e);
+			kvm_inject_emulated_page_fault(vcpu, &e);
 			return 1;
 		}
 	}
@@ -5108,7 +5108,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
 	if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
 					sizeof(gpa_t), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
+		kvm_inject_emulated_page_fault(vcpu, &e);
 		return 1;
 	}
 	return nested_vmx_succeed(vcpu);
@@ -5152,7 +5152,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 			vmx_instruction_info, false, sizeof(operand), &gva))
 		return 1;
 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
+		kvm_inject_emulated_page_fault(vcpu, &e);
 		return 1;
 	}
 
@@ -5220,7 +5220,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 			vmx_instruction_info, false, sizeof(operand), &gva))
 		return 1;
 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
+		kvm_inject_emulated_page_fault(vcpu, &e);
 		return 1;
 	}
 	if (operand.vpid >> 16)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 01330096ff3e..e671bcfa29c4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5393,7 +5393,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
+		kvm_inject_emulated_page_fault(vcpu, &e);
 		return 1;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2ab821f6281f..3984574e09bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -619,8 +619,17 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 
 	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
 					       vcpu->arch.walk_mmu;
-	fault_mmu->inject_page_fault(vcpu, fault);
 
+	/*
+	 * Invalidate the TLB entry for the faulting address, if it exists,
+	 * else the access will fault indefinitely (and to emulate hardware).
+	 */
+	if ((fault->error_code & PFERR_PRESENT_MASK) &&
+	    !(fault->error_code & PFERR_RSVD_MASK))
+		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
+				       fault_mmu->root_hpa);
+
+	fault_mmu->inject_page_fault(vcpu, fault);
 	return fault->nested_page_fault;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
-- 
cgit v1.2.3


From c746b3a4b84cc6eb2f8a45e89210f57d252d98fd Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:04 -0700
Subject: KVM: VMX: Skip global INVVPID fallback if vpid==0 in
 vpid_sync_context()

Skip the global INVVPID in the unlikely scenario that vpid==0 and the
SINGLE_CONTEXT variant of INVVPID is unsupported.  If vpid==0, there's
no need to INVVPID as it's impossible to do VM-Enter with VPID enabled
and vmcs.VPID==0, i.e. there can't be any TLB entries for the vCPU with
vpid==0.  The fact that the SINGLE_CONTEXT variant isn't supported is
irrelevant.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-9-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/ops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 19717d0a1100..a1cb6d4114dc 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -300,7 +300,7 @@ static inline void vpid_sync_context(int vpid)
 {
 	if (cpu_has_vmx_invvpid_single())
 		vpid_sync_vcpu_single(vpid);
-	else
+	else if (vpid != 0)
 		vpid_sync_vcpu_global();
 }
 
-- 
cgit v1.2.3


From 446ace4bca423297d62342801799ba4952d6b910 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:05 -0700
Subject: KVM: VMX: Use vpid_sync_context() directly when possible

Use vpid_sync_context() directly for flows that run if and only if
enable_vpid=1, or more specifically, nested VMX flows that are gated by
vmx->nested.msrs.secondary_ctls_high.SECONDARY_EXEC_ENABLE_VPID being
set, which is allowed if and only if enable_vpid=1.  Because these flows
call __vmx_flush_tlb() with @invalidate_gpa=false, the if-statement that
decides between INVEPT and INVVPID will always go down the INVVPID path,
i.e. call vpid_sync_context() because
"enable_ept && (invalidate_gpa || !enable_vpid)" always evaluates false.

This helps pave the way toward removing @invalidate_gpa and @vpid from
__vmx_flush_tlb() and its callers.

Opportunstically drop unnecessary brackets in handle_invvpid() around an
affected __vmx_flush_tlb()->vpid_sync_context() conversion.

No functional change intended.

Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-10-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d0857bfcb568..919fb7409535 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2459,7 +2459,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
+				vpid_sync_context(nested_get_vpid02(vcpu));
 			}
 		} else {
 			/*
@@ -5234,21 +5234,21 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		    is_noncanonical_address(operand.gla, vcpu))
 			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		if (cpu_has_vmx_invvpid_individual_addr()) {
+		if (cpu_has_vmx_invvpid_individual_addr())
 			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
 				vpid02, operand.gla);
-		} else
-			__vmx_flush_tlb(vcpu, vpid02, false);
+		else
+			vpid_sync_context(vpid02);
 		break;
 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
 		if (!operand.vpid)
 			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		__vmx_flush_tlb(vcpu, vpid02, false);
+		vpid_sync_context(vpid02);
 		break;
 	case VMX_VPID_EXTENT_ALL_CONTEXT:
-		__vmx_flush_tlb(vcpu, vpid02, false);
+		vpid_sync_context(vpid02);
 		break;
 	default:
 		WARN_ON_ONCE(1);
-- 
cgit v1.2.3


From 8a8b097c6cd0041da6ba3a0701f911bfee05c652 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:06 -0700
Subject: KVM: VMX: Move vpid_sync_vcpu_addr() down a few lines

Move vpid_sync_vcpu_addr() below vpid_sync_context() so that it can be
refactored in a future patch to call vpid_sync_context() directly when
the "individual address" INVVPID variant isn't supported.

No functional change intended.

Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-11-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/ops.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index a1cb6d4114dc..6e7ceb0a1c0f 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -268,19 +268,6 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
 	vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
 }
 
-static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
-	if (vpid == 0)
-		return true;
-
-	if (cpu_has_vmx_invvpid_individual_addr()) {
-		__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
-		return true;
-	}
-
-	return false;
-}
-
 static inline void vpid_sync_vcpu_single(int vpid)
 {
 	if (vpid == 0)
@@ -304,6 +291,19 @@ static inline void vpid_sync_context(int vpid)
 		vpid_sync_vcpu_global();
 }
 
+static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+	if (vpid == 0)
+		return true;
+
+	if (cpu_has_vmx_invvpid_individual_addr()) {
+		__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+		return true;
+	}
+
+	return false;
+}
+
 static inline void ept_sync_global(void)
 {
 	__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
-- 
cgit v1.2.3


From ab4b3597ffce01e9080ce5760e5a407a134701ae Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:07 -0700
Subject: KVM: VMX: Handle INVVPID fallback logic in vpid_sync_vcpu_addr()

Directly invoke vpid_sync_context() to do a global INVVPID when the
individual address variant is not supported instead of deferring such
behavior to the caller.  This allows for additional consolidation of
code as the logic is basically identical to the emulation of the
individual address variant in handle_invvpid().

No functional change intended.

Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-12-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/ops.h | 12 +++++-------
 arch/x86/kvm/vmx/vmx.c |  3 +--
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 6e7ceb0a1c0f..c43853d2cdcc 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -291,17 +291,15 @@ static inline void vpid_sync_context(int vpid)
 		vpid_sync_vcpu_global();
 }
 
-static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
 {
 	if (vpid == 0)
-		return true;
+		return;
 
-	if (cpu_has_vmx_invvpid_individual_addr()) {
+	if (cpu_has_vmx_invvpid_individual_addr())
 		__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
-		return true;
-	}
-
-	return false;
+	else
+		vpid_sync_context(vpid);
 }
 
 static inline void ept_sync_global(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e671bcfa29c4..d973ff095066 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2842,8 +2842,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
 	int vpid = to_vmx(vcpu)->vpid;
 
-	if (!vpid_sync_vcpu_addr(vpid, addr))
-		vpid_sync_context(vpid);
+	vpid_sync_vcpu_addr(vpid, addr);
 
 	/*
 	 * If VPIDs are not supported or enabled, then the above is a no-op.
-- 
cgit v1.2.3


From ca431c0cc3317e36c7011df2d2319385465db442 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:08 -0700
Subject: KVM: VMX: Drop redundant capability checks in low level INVVPID
 helpers

Remove the INVVPID capabilities checks from vpid_sync_vcpu_single() and
vpid_sync_vcpu_global() now that all callers ensure the INVVPID variant
is supported.  Note, in some cases the guarantee is provided in concert
with hardware_setup(), which enables VPID if and only if at least of
invvpid_single() or invvpid_global() is supported.

Drop the WARN_ON_ONCE() from vmx_flush_tlb() as vpid_sync_vcpu_single()
will trigger a WARN() on INVVPID failure, i.e. if SINGLE_CONTEXT isn't
supported.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-13-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/ops.h | 6 ++----
 arch/x86/kvm/vmx/vmx.h | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index c43853d2cdcc..5f1ac002b4b6 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -273,14 +273,12 @@ static inline void vpid_sync_vcpu_single(int vpid)
 	if (vpid == 0)
 		return;
 
-	if (cpu_has_vmx_invvpid_single())
-		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
 }
 
 static inline void vpid_sync_vcpu_global(void)
 {
-	if (cpu_has_vmx_invvpid_global())
-		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+	__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 }
 
 static inline void vpid_sync_context(int vpid)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index fa40319f1698..1560296dde25 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -534,7 +534,6 @@ static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 			if (cpu_has_vmx_invvpid_global()) {
 				vpid_sync_vcpu_global();
 			} else {
-				WARN_ON_ONCE(!cpu_has_vmx_invvpid_single());
 				vpid_sync_vcpu_single(vmx->vpid);
 				vpid_sync_vcpu_single(vmx->nested.vpid02);
 			}
-- 
cgit v1.2.3


From bc41d0c40ec2a09b775e2c65f425f903733f5b22 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:09 -0700
Subject: KVM: nVMX: Use vpid_sync_vcpu_addr() to emulate INVVPID with address

Use vpid_sync_vcpu_addr() to emulate the "individual address" variant of
INVVPID now that said function handles the fallback case of the (host)
CPU not supporting "individual address".

Note, the "vpid == 0" checks in the vpid_sync_*() helpers aren't
actually redundant with the "!operand.vpid" check in handle_invvpid(),
as the vpid passed to vpid_sync_vcpu_addr() is a KVM (host) controlled
value, i.e. vpid02 can be zero even if operand.vpid is non-zero.

No functional change intended.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-14-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 919fb7409535..cbc687e9209d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5234,11 +5234,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		    is_noncanonical_address(operand.gla, vcpu))
 			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-		if (cpu_has_vmx_invvpid_individual_addr())
-			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
-				vpid02, operand.gla);
-		else
-			vpid_sync_context(vpid02);
+		vpid_sync_vcpu_addr(vpid02, operand.gla);
 		break;
 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-- 
cgit v1.2.3


From e64419d991ea212af087d3c57fcabb4d27db03fc Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:10 -0700
Subject: KVM: x86: Move "flush guest's TLB" logic to separate kvm_x86_ops hook

Add a dedicated hook to handle flushing TLB entries on behalf of the
guest, i.e. for a paravirtualized TLB flush, and use it directly instead
of bouncing through kvm_vcpu_flush_tlb().

For VMX, change the effective implementation implementation to never do
INVEPT and flush only the current context, i.e. to always flush via
INVVPID(SINGLE_CONTEXT).  The INVEPT performed by __vmx_flush_tlb() when
@invalidate_gpa=false and enable_vpid=0 is unnecessary, as it will only
flush guest-physical mappings; linear and combined mappings are flushed
by VM-Enter when VPID is disabled, and changes in the guest pages tables
do not affect guest-physical mappings.

When EPT and VPID are enabled, doing INVVPID is not required (by Intel's
architecture) to invalidate guest-physical mappings, i.e. TLB entries
that cache guest-physical mappings can live across INVVPID as the
mappings are associated with an EPTP, not a VPID.  The intent of
@invalidate_gpa is to inform vmx_flush_tlb() that it must "invalidate
gpa mappings", i.e. do INVEPT and not simply INVVPID.  Other than nested
VPID handling, which now calls vpid_sync_context() directly, the only
scenario where KVM can safely do INVVPID instead of INVEPT (when EPT is
enabled) is if KVM is flushing TLB entries from the guest's perspective,
i.e. is only required to invalidate linear mappings.

For SVM, flushing TLB entries from the guest's perspective can be done
by flushing the current ASID, as changes to the guest's page tables are
associated only with the current ASID.

Adding a dedicated ->tlb_flush_guest() paves the way toward removing
@invalidate_gpa, which is a potentially dangerous control flag as its
meaning is not exactly crystal clear, even for those who are familiar
with the subtleties of what mappings Intel CPUs are/aren't allowed to
keep across various invalidation scenarios.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-15-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++++
 arch/x86/kvm/svm/svm.c          |  6 ++++++
 arch/x86/kvm/vmx/vmx.c          | 13 +++++++++++++
 arch/x86/kvm/x86.c              |  2 +-
 4 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8c1e094a639f..a7c1ea136c8c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1114,6 +1114,12 @@ struct kvm_x86_ops {
 	 */
 	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
 
+	/*
+	 * Flush any TLB entries created by the guest.  Like tlb_flush_gva(),
+	 * does not need to flush GPA->HPA mappings.
+	 */
+	void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+
 	void (*run)(struct kvm_vcpu *vcpu);
 	int (*handle_exit)(struct kvm_vcpu *vcpu,
 		enum exit_fastpath_completion exit_fastpath);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2f379bacbb26..2c0fcb87a054 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3170,6 +3170,11 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
 	invlpga(gva, svm->vmcb->control.asid);
 }
 
+static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	svm_flush_tlb(vcpu, false);
+}
+
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 }
@@ -3939,6 +3944,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.tlb_flush = svm_flush_tlb,
 	.tlb_flush_gva = svm_flush_tlb_gva,
+	.tlb_flush_guest = svm_flush_tlb_guest,
 
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d973ff095066..094acb53829b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2851,6 +2851,18 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 	 */
 }
 
+static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
+	 * or a vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit
+	 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+	 * i.e. no explicit INVVPID is necessary.
+	 */
+	vpid_sync_context(to_vmx(vcpu)->vpid);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -7718,6 +7730,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.tlb_flush = vmx_flush_tlb,
 	.tlb_flush_gva = vmx_flush_tlb_gva,
+	.tlb_flush_guest = vmx_flush_tlb_guest,
 
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3984574e09bf..c7ad142b511f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2719,7 +2719,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 		st->preempted & KVM_VCPU_FLUSH_TLB);
 	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
-		kvm_vcpu_flush_tlb(vcpu, false);
+		kvm_x86_ops.tlb_flush_guest(vcpu);
 
 	vcpu->arch.st.preempted = 0;
 
-- 
cgit v1.2.3


From 0baedd792713063213f1e2060dc6a5d536638f0a Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 25 Mar 2020 12:28:24 -0400
Subject: KVM: x86: make Hyper-V PV TLB flush use tlb_flush_guest()

Hyper-V PV TLB flush mechanism does TLB flush on behalf of the guest
so doing tlb_flush_all() is an overkill, switch to using tlb_flush_guest()
(just like KVM PV TLB flush mechanism) instead. Introduce
KVM_REQ_HV_TLB_FLUSH to support the change.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/hyperv.c           |  3 +--
 arch/x86/kvm/x86.c              | 10 +++++++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7c1ea136c8c..9951b01df57c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,6 +83,8 @@
 #define KVM_REQ_GET_VMCS12_PAGES	KVM_ARCH_REQ(24)
 #define KVM_REQ_APICV_UPDATE \
 	KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_TLB_FLUSH \
+	KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_NO_WAKEUP)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index bcefa9d4e57e..b850f676abe4 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1425,8 +1425,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
 	 * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
 	 * analyze it here, flush TLB regardless of the specified address space.
 	 */
-	kvm_make_vcpus_request_mask(kvm,
-				    KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+	kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
 				    vcpu_mask, &hv_vcpu->tlb_flush);
 
 ret_success:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c7ad142b511f..a7df68af65e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2696,6 +2696,12 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 	kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa);
 }
 
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.tlb_flush;
+	kvm_x86_ops.tlb_flush_guest(vcpu);
+}
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
 	struct kvm_host_map map;
@@ -2719,7 +2725,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 		st->preempted & KVM_VCPU_FLUSH_TLB);
 	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
-		kvm_x86_ops.tlb_flush_guest(vcpu);
+		kvm_vcpu_flush_tlb_guest(vcpu);
 
 	vcpu->arch.st.preempted = 0;
 
@@ -8218,6 +8224,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_load_pgd(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb(vcpu, true);
+		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+			kvm_vcpu_flush_tlb_guest(vcpu);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;
-- 
cgit v1.2.3


From ad104b5e433a6851a3a9822663c1ca756832a5f0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:11 -0700
Subject: KVM: VMX: Clean up vmx_flush_tlb_gva()

Refactor vmx_flush_tlb_gva() to remove a superfluous local variable and
clean up its comment, which is oddly located below the code it is
commenting.

No functional change intended.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-16-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 094acb53829b..8b55f43433a4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2840,15 +2840,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
-	int vpid = to_vmx(vcpu)->vpid;
-
-	vpid_sync_vcpu_addr(vpid, addr);
-
 	/*
-	 * If VPIDs are not supported or enabled, then the above is a no-op.
-	 * But we don't really need a TLB flush in that case anyway, because
-	 * each VM entry/exit includes an implicit flush when VPID is 0.
+	 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+	 * vmx_flush_tlb_guest() for an explanation of why this is ok.
 	 */
+	vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
 }
 
 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From f55ac304ca47039368a5971fa61ebc8160c90659 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:12 -0700
Subject: KVM: x86: Drop @invalidate_gpa param from kvm_x86_ops' tlb_flush()

Drop @invalidate_gpa from ->tlb_flush() and kvm_vcpu_flush_tlb() now
that all callers pass %true for said param, or ignore the param (SVM has
an internal call to svm_flush_tlb() in svm_flush_tlb_guest that somewhat
arbitrarily passes %false).

Remove __vmx_flush_tlb() as it is no longer used.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-17-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/nested.c       |  2 +-
 arch/x86/kvm/svm/svm.c          |  6 +++---
 arch/x86/kvm/svm/svm.h          |  2 +-
 arch/x86/kvm/vmx/vmx.c          |  4 ++--
 arch/x86/kvm/vmx/vmx.h          | 42 ++++++++++++-----------------------------
 arch/x86/kvm/x86.c              |  6 +++---
 8 files changed, 24 insertions(+), 42 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9951b01df57c..7156c749677a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1103,7 +1103,7 @@ struct kvm_x86_ops {
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-	void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+	void (*tlb_flush)(struct kvm_vcpu *vcpu);
 	int  (*tlb_remote_flush)(struct kvm *kvm);
 	int  (*tlb_remote_flush_with_range)(struct kvm *kvm,
 			struct kvm_tlb_range *range);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 32c9f4b2a281..081f6b220b6e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5179,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	kvm_mmu_load_pgd(vcpu);
-	kvm_x86_ops.tlb_flush(vcpu, true);
+	kvm_x86_ops.tlb_flush(vcpu);
 out:
 	return r;
 }
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 90a1ca939627..c62893102777 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -279,7 +279,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
-	svm_flush_tlb(&svm->vcpu, true);
+	svm_flush_tlb(&svm->vcpu);
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2c0fcb87a054..aafd18589fda 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1603,7 +1603,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return 1;
 
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-		svm_flush_tlb(vcpu, true);
+		svm_flush_tlb(vcpu);
 
 	vcpu->arch.cr4 = cr4;
 	if (!npt_enabled)
@@ -3153,7 +3153,7 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
 	return 0;
 }
 
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3172,7 +3172,7 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
 
 static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
-	svm_flush_tlb(vcpu, false);
+	svm_flush_tlb(vcpu);
 }
 
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index df3474f4fb02..ca95204f9dde 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -360,7 +360,7 @@ u32 svm_msrpm_offset(u32 msr);
 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+void svm_flush_tlb(struct kvm_vcpu *vcpu);
 void disable_nmi_singlestep(struct vcpu_svm *svm);
 
 /* nested.c */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8b55f43433a4..d843ef34aafd 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6083,7 +6083,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 		if (flexpriority_enabled) {
 			sec_exec_control |=
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			vmx_flush_tlb(vcpu, true);
+			vmx_flush_tlb(vcpu);
 		}
 		break;
 	case LAPIC_MODE_X2APIC:
@@ -6101,7 +6101,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 {
 	if (!is_guest_mode(vcpu)) {
 		vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		vmx_flush_tlb(vcpu, true);
+		vmx_flush_tlb(vcpu);
 	}
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1560296dde25..45978552524e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -500,46 +500,28 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
 
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
-				bool invalidate_gpa)
-{
-	if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-		if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-			return;
-		ept_sync_context(construct_eptp(vcpu,
-						vcpu->arch.mmu->root_hpa));
-	} else {
-		vpid_sync_context(vpid);
-	}
-}
-
-static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	/*
-	 * Flush all EPTP/VPID contexts if the TLB flush _may_ have been
-	 * invoked via kvm_flush_remote_tlbs(), which always passes %true for
-	 * @invalidate_gpa.  Flushing remote TLBs requires all contexts to be
-	 * flushed, not just the active context.
+	 * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been
+	 * invoked via kvm_flush_remote_tlbs().  Flushing remote TLBs requires
+	 * all contexts to be flushed, not just the active context.
 	 *
 	 * Note, this also ensures a deferred TLB flush with VPID enabled and
 	 * EPT disabled invalidates the "correct" VPID, by nuking both L1 and
 	 * L2's VPIDs.
 	 */
-	if (invalidate_gpa) {
-		if (enable_ept) {
-			ept_sync_global();
-		} else if (enable_vpid) {
-			if (cpu_has_vmx_invvpid_global()) {
-				vpid_sync_vcpu_global();
-			} else {
-				vpid_sync_vcpu_single(vmx->vpid);
-				vpid_sync_vcpu_single(vmx->nested.vpid02);
-			}
+	if (enable_ept) {
+		ept_sync_global();
+	} else if (enable_vpid) {
+		if (cpu_has_vmx_invvpid_global()) {
+			vpid_sync_vcpu_global();
+		} else {
+			vpid_sync_vcpu_single(vmx->vpid);
+			vpid_sync_vcpu_single(vmx->nested.vpid02);
 		}
-	} else {
-		__vmx_flush_tlb(vcpu, vmx->vpid, false);
 	}
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a7df68af65e5..daf16247870c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2690,10 +2690,10 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.time = 0;
 }
 
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa);
+	kvm_x86_ops.tlb_flush(vcpu);
 }
 
 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -8223,7 +8223,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 			kvm_mmu_load_pgd(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
-			kvm_vcpu_flush_tlb(vcpu, true);
+			kvm_vcpu_flush_tlb(vcpu);
 		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb_guest(vcpu);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
-- 
cgit v1.2.3


From 72b38320872666f7fb6ff9564bf91139c685c234 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:13 -0700
Subject: KVM: SVM: Wire up ->tlb_flush_guest() directly to svm_flush_tlb()

Use svm_flush_tlb() directly for kvm_x86_ops->tlb_flush_guest() now that
the @invalidate_gpa param to ->tlb_flush() is gone, i.e. the wrapper for
->tlb_flush_guest() is no longer necessary.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-18-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index aafd18589fda..2ad6a8d1a77f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3170,11 +3170,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
 	invlpga(gva, svm->vmcb->control.asid);
 }
 
-static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
-{
-	svm_flush_tlb(vcpu);
-}
-
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 }
@@ -3944,7 +3939,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.tlb_flush = svm_flush_tlb,
 	.tlb_flush_gva = svm_flush_tlb_gva,
-	.tlb_flush_guest = svm_flush_tlb_guest,
+	.tlb_flush_guest = svm_flush_tlb,
 
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
-- 
cgit v1.2.3


From 5058b692c69997f9736b94df786509366c32f34d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:14 -0700
Subject: KVM: VMX: Move vmx_flush_tlb() to vmx.c

Move vmx_flush_tlb() to vmx.c and make it non-inline static now that all
its callers live in vmx.c.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-19-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 25 +++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.h | 25 -------------------------
 2 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d843ef34aafd..3cdca4d03c07 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2838,6 +2838,31 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been
+	 * invoked via kvm_flush_remote_tlbs().  Flushing remote TLBs requires
+	 * all contexts to be flushed, not just the active context.
+	 *
+	 * Note, this also ensures a deferred TLB flush with VPID enabled and
+	 * EPT disabled invalidates the "correct" VPID, by nuking both L1 and
+	 * L2's VPIDs.
+	 */
+	if (enable_ept) {
+		ept_sync_global();
+	} else if (enable_vpid) {
+		if (cpu_has_vmx_invvpid_global()) {
+			vpid_sync_vcpu_global();
+		} else {
+			vpid_sync_vcpu_single(vmx->vpid);
+			vpid_sync_vcpu_single(vmx->nested.vpid02);
+		}
+	}
+}
+
 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
 	/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 45978552524e..4c7b0713b438 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -500,31 +500,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
 
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 
-static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	/*
-	 * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been
-	 * invoked via kvm_flush_remote_tlbs().  Flushing remote TLBs requires
-	 * all contexts to be flushed, not just the active context.
-	 *
-	 * Note, this also ensures a deferred TLB flush with VPID enabled and
-	 * EPT disabled invalidates the "correct" VPID, by nuking both L1 and
-	 * L2's VPIDs.
-	 */
-	if (enable_ept) {
-		ept_sync_global();
-	} else if (enable_vpid) {
-		if (cpu_has_vmx_invvpid_global()) {
-			vpid_sync_vcpu_global();
-		} else {
-			vpid_sync_vcpu_single(vmx->vpid);
-			vpid_sync_vcpu_single(vmx->nested.vpid02);
-		}
-	}
-}
-
 static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
 {
 	vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
-- 
cgit v1.2.3


From 25d8b84376e7c12ce4e4ec88c0f5bf0bfc9ff8f4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:15 -0700
Subject: KVM: nVMX: Move nested_get_vpid02() to vmx/nested.h

Move nested_get_vpid02() to vmx/nested.h so that a future patch can
reference it from vmx.c to implement context-specific TLB flushing.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-20-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 7 -------
 arch/x86/kvm/vmx/nested.h | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index cbc687e9209d..b18dc7bc3be7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1132,13 +1132,6 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
 }
 
-static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
-}
-
 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
 {
 	superset &= mask;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index ac56aefa49e3..e8f4a74f7251 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -62,6 +62,13 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
 		vmx->nested.hv_evmcs;
 }
 
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
 static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
 {
 	/* return the page table to be shadowed - in our case, EPT12 */
-- 
cgit v1.2.3


From 33d19ec9b14c0e304f5d44a0dd1403539360968b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:16 -0700
Subject: KVM: VMX: Introduce vmx_flush_tlb_current()

Add a helper to flush TLB entries only for the current EPTP/VPID context
and use it for the existing direct invocations of vmx_flush_tlb().  TLB
flushes that are specific to the current vCPU state do not need to flush
other contexts.

Note, both converted call sites happen to be related to the APIC access
page, this is purely coincidental.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-21-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3cdca4d03c07..0f1dfbae649f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2863,6 +2863,22 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+	u64 root_hpa = vcpu->arch.mmu->root_hpa;
+
+	/* No flush required if the current context is invalid. */
+	if (!VALID_PAGE(root_hpa))
+		return;
+
+	if (enable_ept)
+		ept_sync_context(construct_eptp(vcpu, root_hpa));
+	else if (!is_guest_mode(vcpu))
+		vpid_sync_context(to_vmx(vcpu)->vpid);
+	else
+		vpid_sync_context(nested_get_vpid02(vcpu));
+}
+
 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
 	/*
@@ -6108,7 +6124,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 		if (flexpriority_enabled) {
 			sec_exec_control |=
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			vmx_flush_tlb(vcpu);
+			vmx_flush_tlb_current(vcpu);
 		}
 		break;
 	case LAPIC_MODE_X2APIC:
@@ -6126,7 +6142,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 {
 	if (!is_guest_mode(vcpu)) {
 		vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		vmx_flush_tlb(vcpu);
+		vmx_flush_tlb_current(vcpu);
 	}
 }
 
-- 
cgit v1.2.3


From 4a41e43cbe2c0f47d7ec80d156ab786d03cdaacd Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:17 -0700
Subject: KVM: SVM: Document the ASID logic in svm_flush_tlb()

Add a comment in svm_flush_tlb() to document why it flushes only the
current ASID, even when it is invoked when flushing remote TLBs.

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-22-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2ad6a8d1a77f..fa5bb1b62059 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3157,6 +3157,13 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	/*
+	 * Flush only the current ASID even if the TLB flush was invoked via
+	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
+	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
+	 * unconditionally does a TLB flush on both nested VM-Enter and nested
+	 * VM-Exit (via kvm_mmu_reset_context()).
+	 */
 	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
 		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
 	else
-- 
cgit v1.2.3


From 7780938cc70b848650722762fa4c7496fa68f9ec Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:18 -0700
Subject: KVM: x86: Rename ->tlb_flush() to ->tlb_flush_all()

Rename ->tlb_flush() to ->tlb_flush_all() in preparation for adding a
new hook to flush only the current ASID/context.

Opportunstically replace the comment in vmx_flush_tlb() that explains
why it flushes all EPTP/VPID contexts with a comment explaining why it
unconditionally uses INVEPT when EPT is enabled.  I.e. rely on the "all"
part of the name to clarify why it does global INVEPT/INVVPID.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-23-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/svm.c          |  2 +-
 arch/x86/kvm/vmx/vmx.c          | 16 +++++++---------
 arch/x86/kvm/x86.c              |  6 +++---
 5 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7156c749677a..e6305d5e28fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1103,7 +1103,7 @@ struct kvm_x86_ops {
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-	void (*tlb_flush)(struct kvm_vcpu *vcpu);
+	void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
 	int  (*tlb_remote_flush)(struct kvm *kvm);
 	int  (*tlb_remote_flush_with_range)(struct kvm *kvm,
 			struct kvm_tlb_range *range);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 081f6b220b6e..85e17a057094 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5179,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	kvm_mmu_load_pgd(vcpu);
-	kvm_x86_ops.tlb_flush(vcpu);
+	kvm_x86_ops.tlb_flush_all(vcpu);
 out:
 	return r;
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index fa5bb1b62059..72b976e64df9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3944,7 +3944,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
 
-	.tlb_flush = svm_flush_tlb,
+	.tlb_flush_all = svm_flush_tlb,
 	.tlb_flush_gva = svm_flush_tlb_gva,
 	.tlb_flush_guest = svm_flush_tlb,
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0f1dfbae649f..da868846d96d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2838,18 +2838,16 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	/*
-	 * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been
-	 * invoked via kvm_flush_remote_tlbs().  Flushing remote TLBs requires
-	 * all contexts to be flushed, not just the active context.
-	 *
-	 * Note, this also ensures a deferred TLB flush with VPID enabled and
-	 * EPT disabled invalidates the "correct" VPID, by nuking both L1 and
-	 * L2's VPIDs.
+	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+	 * the CPU is not required to invalidate guest-physical mappings on
+	 * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
+	 * associated with the root EPT structure and not any particular VPID
+	 * (INVVPID also isn't required to invalidate guest-physical mappings).
 	 */
 	if (enable_ept) {
 		ept_sync_global();
@@ -7765,7 +7763,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
 
-	.tlb_flush = vmx_flush_tlb,
+	.tlb_flush_all = vmx_flush_tlb_all,
 	.tlb_flush_gva = vmx_flush_tlb_gva,
 	.tlb_flush_guest = vmx_flush_tlb_guest,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index daf16247870c..cd2a3d01bffb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2690,10 +2690,10 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.time = 0;
 }
 
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops.tlb_flush(vcpu);
+	kvm_x86_ops.tlb_flush_all(vcpu);
 }
 
 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -8223,7 +8223,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 			kvm_mmu_load_pgd(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
-			kvm_vcpu_flush_tlb(vcpu);
+			kvm_vcpu_flush_tlb_all(vcpu);
 		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb_guest(vcpu);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
-- 
cgit v1.2.3


From 50b265a4eee0d9fb9581f1a300742515b80dffda Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:19 -0700
Subject: KVM: nVMX: Add helper to handle TLB flushes on nested
 VM-Enter/VM-Exit

Add a helper to determine whether or not a full TLB flush needs to be
performed on nested VM-Enter/VM-Exit, as the logic is identical for both
flows and needs a fairly beefy comment to boot.  This also provides a
common point to make future adjustments to the logic.

Handle vpid12 changes the new helper as well even though it is specific
to VM-Enter.  The vpid12 logic is an extension of the flushing logic,
and it's worth the extra bool parameter to provide a single location for
the flushing logic.

Cc: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-24-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 88 +++++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 44 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b18dc7bc3be7..ba1aedb535ef 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1132,6 +1132,48 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
 }
 
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+					    struct vmcs12 *vmcs12,
+					    bool is_vmenter)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/*
+	 * If VPID is disabled, linear and combined mappings are flushed on
+	 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
+	 * their associated EPTP.
+	 */
+	if (!enable_vpid)
+		return;
+
+	/*
+	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+	 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
+	 *
+	 * If VPID is enabled and used by vmc12, but L2 does not have a unique
+	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
+	 * a VPID for L2, flush the TLB as the effective ASID is common to both
+	 * L1 and L2.
+	 *
+	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
+	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
+	 * redundant flushes further down the nested pipeline.
+	 *
+	 * If a TLB flush isn't required due to any of the above, and vpid12 is
+	 * changing then the new "virtual" VPID (vpid12) will reuse the same
+	 * "real" VPID (vpid02), and so needs to be sync'd.  There is no direct
+	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
+	 * all nested vCPUs.
+	 */
+	if (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu)) {
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	} else if (is_vmenter &&
+		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+		vpid_sync_context(nested_get_vpid02(vcpu));
+	}
+}
+
 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
 {
 	superset &= mask;
@@ -2440,32 +2482,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (kvm_has_tsc_control)
 		decache_tsc_multiplier(vmx);
 
-	if (enable_vpid) {
-		/*
-		 * There is no direct mapping between vpid02 and vpid12, the
-		 * vpid02 is per-vCPU for L0 and reused while the value of
-		 * vpid12 is changed w/ one invvpid during nested vmentry.
-		 * The vpid12 is allocated by L1 for L2, so it will not
-		 * influence global bitmap(for vpid01 and vpid02 allocation)
-		 * even if spawn a lot of nested vCPUs.
-		 */
-		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
-			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
-				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-				vpid_sync_context(nested_get_vpid02(vcpu));
-			}
-		} else {
-			/*
-			 * If L1 use EPT, then L0 needs to execute INVEPT on
-			 * EPTP02 instead of EPTP01. Therefore, delay TLB
-			 * flush until vmcs02->eptp is fully updated by
-			 * KVM_REQ_LOAD_MMU_PGD. Note that this assumes
-			 * KVM_REQ_TLB_FLUSH is evaluated after
-			 * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest().
-			 */
-			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-		}
-	}
+	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
 
 	if (nested_cpu_has_ept(vmcs12))
 		nested_ept_init_mmu_context(vcpu);
@@ -4033,24 +4050,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	if (!enable_ept)
 		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 
-	/*
-	 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
-	 * VMEntry/VMExit. Thus, no need to flush TLB.
-	 *
-	 * If vmcs12 doesn't use VPID, L1 expects TLB to be
-	 * flushed on every VMEntry/VMExit.
-	 *
-	 * Otherwise, we can preserve TLB entries as long as we are
-	 * able to tag L1 TLB entries differently than L2 TLB entries.
-	 *
-	 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
-	 * and therefore we request the TLB flush to happen only after VMCS EPTP
-	 * has been set by KVM_REQ_LOAD_MMU_PGD.
-	 */
-	if (enable_vpid &&
-	    (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-	}
+	nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
 
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
-- 
cgit v1.2.3


From eeeb4f67a6cd437da1f5d1a20596cdc2d7b50551 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:20 -0700
Subject: KVM: x86: Introduce KVM_REQ_TLB_FLUSH_CURRENT to flush current ASID

Add KVM_REQ_TLB_FLUSH_CURRENT to allow optimized TLB flushing of VMX's
EPTP/VPID contexts[*] from the KVM MMU and/or in a deferred manner, e.g.
to flush L2's context during nested VM-Enter.

Convert KVM_REQ_TLB_FLUSH to KVM_REQ_TLB_FLUSH_CURRENT in flows where
the flush is directly associated with vCPU-scoped instruction emulation,
i.e. MOV CR3 and INVPCID.

Add a comment in vmx_vcpu_load_vmcs() above its KVM_REQ_TLB_FLUSH to
make it clear that it deliberately requests a flush of all contexts.

Service any pending flush request on nested VM-Exit as it's possible a
nested VM-Exit could occur after requesting a flush for L2.  Add the
same logic for nested VM-Enter even though it's _extremely_ unlikely
for flush to be pending on nested VM-Enter, but theoretically possible
(in the future) due to RSM (SMM) emulation.

[*] Intel also has an Address Space Identifier (ASID) concept, e.g.
    EPTP+VPID+PCID == ASID, it's just not documented in the SDM because
    the rules of invalidation are different based on which piece of the
    ASID is being changed, i.e. whether the EPTP, VPID, or PCID context
    must be invalidated.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-25-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/svm/svm.c          |  1 +
 arch/x86/kvm/vmx/nested.c       |  7 +++++++
 arch/x86/kvm/vmx/vmx.c          |  7 ++++++-
 arch/x86/kvm/x86.c              | 11 +++++++++--
 arch/x86/kvm/x86.h              |  6 ++++++
 6 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e6305d5e28fa..72e9c4492f47 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,8 +83,9 @@
 #define KVM_REQ_GET_VMCS12_PAGES	KVM_ARCH_REQ(24)
 #define KVM_REQ_APICV_UPDATE \
 	KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_TLB_FLUSH_CURRENT	KVM_ARCH_REQ(26)
 #define KVM_REQ_HV_TLB_FLUSH \
-	KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_NO_WAKEUP)
+	KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1104,6 +1105,7 @@ struct kvm_x86_ops {
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 	void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
+	void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
 	int  (*tlb_remote_flush)(struct kvm *kvm);
 	int  (*tlb_remote_flush_with_range)(struct kvm *kvm,
 			struct kvm_tlb_range *range);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 72b976e64df9..66123848448d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3945,6 +3945,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.set_rflags = svm_set_rflags,
 
 	.tlb_flush_all = svm_flush_tlb,
+	.tlb_flush_current = svm_flush_tlb,
 	.tlb_flush_gva = svm_flush_tlb_gva,
 	.tlb_flush_guest = svm_flush_tlb,
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ba1aedb535ef..567a7a0f30e0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3208,6 +3208,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	u32 exit_reason = EXIT_REASON_INVALID_STATE;
 	u32 exit_qual;
 
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+		kvm_vcpu_flush_tlb_current(vcpu);
+
 	evaluate_pending_interrupts = exec_controls_get(vmx) &
 		(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
@@ -4274,6 +4277,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	/* trying to cancel vmlaunch/vmresume is a bug */
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
+	/* Service the TLB flush request for L2 before switching to L1. */
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+		kvm_vcpu_flush_tlb_current(vcpu);
+
 	leave_guest_mode(vcpu);
 
 	if (nested_cpu_has_preemption_timer(vmcs12))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index da868846d96d..d958cee52f41 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1338,6 +1338,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 		void *gdt = get_current_gdt_ro();
 		unsigned long sysenter_esp;
 
+		/*
+		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
+		 * TLB entries from its previous association with the vCPU.
+		 */
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
 		/*
@@ -5468,7 +5472,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 
 		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
 			kvm_mmu_sync_roots(vcpu);
-			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
@@ -7764,6 +7768,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_rflags = vmx_set_rflags,
 
 	.tlb_flush_all = vmx_flush_tlb_all,
+	.tlb_flush_current = vmx_flush_tlb_current,
 	.tlb_flush_gva = vmx_flush_tlb_gva,
 	.tlb_flush_guest = vmx_flush_tlb_guest,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd2a3d01bffb..ebbe34d89469 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1019,7 +1019,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 		if (!skip_tlb_flush) {
 			kvm_mmu_sync_roots(vcpu);
-			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 		return 0;
 	}
@@ -8222,10 +8222,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_sync_roots(vcpu);
 		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
 			kvm_mmu_load_pgd(vcpu);
-		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
 			kvm_vcpu_flush_tlb_all(vcpu);
+
+			/* Flushing all ASIDs flushes the current ASID... */
+			kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+		}
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+			kvm_vcpu_flush_tlb_current(vcpu);
 		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb_guest(vcpu);
+
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b968acc0516f..7b5ed8ed628e 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -125,6 +125,12 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
 	return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
 }
 
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.tlb_flush;
+	kvm_x86_ops.tlb_flush_current(vcpu);
+}
+
 static inline int is_pae(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
-- 
cgit v1.2.3


From 8c8560b833909cbf2e3a73d4dc8e65a60ba66f07 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:21 -0700
Subject: KVM: x86/mmu: Use KVM_REQ_TLB_FLUSH_CURRENT for MMU specific flushes

Flush only the current ASID/context when requesting a TLB flush due to a
change in the current vCPU's MMU to avoid blasting away TLB entries
associated with other ASIDs/contexts, e.g. entries cached for L1 when
a change in L2's MMU requires a flush.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-26-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 85e17a057094..575bd5de2118 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2309,7 +2309,7 @@ static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
 		return;
 
 	if (local_flush)
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 }
 
 #ifdef CONFIG_KVM_MMU_AUDIT
@@ -2516,11 +2516,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 				break;
 
 			WARN_ON(!list_empty(&invalid_list));
-			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 
 		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 
 		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);
@@ -3121,7 +3121,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
 		if (write_fault)
 			ret = RET_PF_EMULATE;
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 	}
 
 	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
@@ -4310,7 +4310,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
 			if (!skip_tlb_flush) {
 				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-				kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+				kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 			}
 
 			/*
@@ -5179,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	kvm_mmu_load_pgd(vcpu);
-	kvm_x86_ops.tlb_flush_all(vcpu);
+	kvm_x86_ops.tlb_flush_current(vcpu);
 out:
 	return r;
 }
-- 
cgit v1.2.3


From c51e1ffee50121a2b2e00208f517a695493ee85d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:22 -0700
Subject: KVM: nVMX: Selectively use TLB_FLUSH_CURRENT for nested
 VM-Enter/VM-Exit

Flush only the current context, as opposed to all contexts, when
requesting a TLB flush to handle the scenario where a L1 does not expect
a TLB flush, but one is required because L1 and L2 shared an ASID.  This
occurs if EPT is disabled (no per-EPTP tag), VPID is enabled (hardware
doesn't flush unconditionally) and vmcs02 does not have its own VPID due
to exhaustion of available VPIDs.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-27-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 567a7a0f30e0..deddf0f0f6e7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1152,8 +1152,8 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 	 *
 	 * If VPID is enabled and used by vmc12, but L2 does not have a unique
 	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
-	 * a VPID for L2, flush the TLB as the effective ASID is common to both
-	 * L1 and L2.
+	 * a VPID for L2, flush the current context as the effective ASID is
+	 * common to both L1 and L2.
 	 *
 	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
 	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
@@ -1165,8 +1165,10 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
 	 * all nested vCPUs.
 	 */
-	if (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu)) {
+	if (!nested_cpu_has_vpid(vmcs12)) {
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	} else if (!nested_has_guest_tlb_tag(vcpu)) {
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 	} else if (is_vmenter &&
 		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-- 
cgit v1.2.3


From 1196cb970b996be69a2fcd9756117b394f8e7526 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:23 -0700
Subject: KVM: nVMX: Reload APIC access page on nested VM-Exit only if
 necessary

Defer reloading L1's APIC page by logging the need for a reload and
processing it during nested VM-Exit instead of unconditionally reloading
the APIC page on nested VM-Exit.  This eliminates a TLB flush on the
majority of VM-Exits as the APIC page rarely needs to be reloaded.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-28-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c |  9 ++++-----
 arch/x86/kvm/vmx/vmx.c    | 10 +++++++---
 arch/x86/kvm/vmx/vmx.h    |  1 +
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index deddf0f0f6e7..a7a526a21292 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4346,11 +4346,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
-	/*
-	 * We are now running in L2, mmu_notifier will force to reload the
-	 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
-	 */
-	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+	if (vmx->nested.reload_vmcs01_apic_access_page) {
+		vmx->nested.reload_vmcs01_apic_access_page = false;
+		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+	}
 
 	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d958cee52f41..4739b780c74e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6142,10 +6142,14 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 
 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 {
-	if (!is_guest_mode(vcpu)) {
-		vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		vmx_flush_tlb_current(vcpu);
+	/* Defer reload until vmcs01 is the current VMCS. */
+	if (is_guest_mode(vcpu)) {
+		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+		return;
 	}
+
+	vmcs_write64(APIC_ACCESS_ADDR, hpa);
+	vmx_flush_tlb_current(vcpu);
 }
 
 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4c7b0713b438..31d7252df163 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -136,6 +136,7 @@ struct nested_vmx {
 	bool vmcs02_initialized;
 
 	bool change_vmcs01_virtual_apic_mode;
+	bool reload_vmcs01_apic_access_page;
 
 	/*
 	 * Enlightened VMCS has been enabled. It does not mean that L1 has to
-- 
cgit v1.2.3


From a4148b7ca2a5afe1295a41b5e30048cabcb74f8d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:24 -0700
Subject: KVM: VMX: Retrieve APIC access page HPA only when necessary

Move the retrieval of the HPA associated with L1's APIC access page into
VMX code to avoid unnecessarily calling gfn_to_page(), e.g. when the
vCPU is in guest mode (L2).  Alternatively, the optimization logic in
VMX could be mirrored into the common x86 code, but that will get ugly
fast when further optimizations are introduced.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-29-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/vmx/vmx.c          | 16 ++++++++++++++--
 arch/x86/kvm/x86.c              | 13 +------------
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 72e9c4492f47..541e2df8fc6e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1152,7 +1152,7 @@ struct kvm_x86_ops {
 	bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
-	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
 	int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4739b780c74e..8550da629a61 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6140,16 +6140,28 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 	vmx_update_msr_bitmap(vcpu);
 }
 
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 {
+	struct page *page;
+
 	/* Defer reload until vmcs01 is the current VMCS. */
 	if (is_guest_mode(vcpu)) {
 		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
 		return;
 	}
 
-	vmcs_write64(APIC_ACCESS_ADDR, hpa);
+	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return;
+
+	vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
 	vmx_flush_tlb_current(vcpu);
+
+	/*
+	 * Do not pin apic access page in memory, the MMU notifier
+	 * will call us again if it is migrated or swapped out.
+	 */
+	put_page(page);
 }
 
 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ebbe34d89469..90aa4abbc0a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8157,24 +8157,13 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
-	struct page *page = NULL;
-
 	if (!lapic_in_kernel(vcpu))
 		return;
 
 	if (!kvm_x86_ops.set_apic_access_page_addr)
 		return;
 
-	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-	if (is_error_page(page))
-		return;
-	kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page));
-
-	/*
-	 * Do not pin apic access page in memory, the MMU notifier
-	 * will call us again if it is migrated or swapped out.
-	 */
-	put_page(page);
+	kvm_x86_ops.set_apic_access_page_addr(vcpu);
 }
 
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 4de1f9d469f4515464c2c4958a59479f641970ef Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:25 -0700
Subject: KVM: VMX: Don't reload APIC access page if its control is disabled

Don't reload the APIC access page if its control is disabled, e.g. if
the guest is running with x2APIC (likely) or with the local APIC
disabled (unlikely), to avoid unnecessary TLB flushes and VMWRITEs.
Unconditionally reload the APIC access page and flush the TLB when
the guest's virtual APIC transitions to "xAPIC enabled", as any
changes to the APIC access page's mapping will not be recorded while
the guest's virtual APIC is disabled.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-30-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8550da629a61..a87b8c1e8de6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6126,7 +6126,15 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 		if (flexpriority_enabled) {
 			sec_exec_control |=
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			vmx_flush_tlb_current(vcpu);
+			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+			/*
+			 * Flush the TLB, reloading the APIC access page will
+			 * only do so if its physical address has changed, but
+			 * the guest may have inserted a non-APIC mapping into
+			 * the TLB while the APIC access page was disabled.
+			 */
+			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 		break;
 	case LAPIC_MODE_X2APIC:
@@ -6150,6 +6158,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 		return;
 	}
 
+	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		return;
+
 	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (is_error_page(page))
 		return;
-- 
cgit v1.2.3


From b869855badd1387bd12415e4d5571e931825b546 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:26 -0700
Subject: KVM: x86/mmu: Move fast_cr3_switch() side effects to
 __kvm_mmu_new_cr3()

Handle the side effects of a fast CR3 (PGD) switch up a level in
__kvm_mmu_new_cr3(), which is the only caller of fast_cr3_switch().

This consolidates handling all side effects in __kvm_mmu_new_cr3()
(where freeing the current root when KVM can't do a fast switch is
already handled), and ameliorates the pain of adding a second boolean in
a future patch to provide a separate "skip" override for the MMU sync.

Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-31-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 69 +++++++++++++++++++++++---------------------------
 1 file changed, 31 insertions(+), 38 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 575bd5de2118..6a939204d467 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4284,8 +4284,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 }
 
 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
-			    union kvm_mmu_page_role new_role,
-			    bool skip_tlb_flush)
+			    union kvm_mmu_page_role new_role)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
@@ -4295,39 +4294,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	 * later if necessary.
 	 */
 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
-	    mmu->root_level >= PT64_ROOT_4LEVEL) {
-		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
-			return false;
-
-		if (cached_root_available(vcpu, new_cr3, new_role)) {
-			/*
-			 * It is possible that the cached previous root page is
-			 * obsolete because of a change in the MMU generation
-			 * number. However, changing the generation number is
-			 * accompanied by KVM_REQ_MMU_RELOAD, which will free
-			 * the root set here and allocate a new one.
-			 */
-			kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
-			if (!skip_tlb_flush) {
-				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-				kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-			}
-
-			/*
-			 * The last MMIO access's GVA and GPA are cached in the
-			 * VCPU. When switching to a new CR3, that GVA->GPA
-			 * mapping may no longer be valid. So clear any cached
-			 * MMIO info even when we don't need to sync the shadow
-			 * page tables.
-			 */
-			vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-
-			__clear_sp_write_flooding_count(
-				page_header(mmu->root_hpa));
-
-			return true;
-		}
-	}
+	    mmu->root_level >= PT64_ROOT_4LEVEL)
+		return !mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT) &&
+		       cached_root_available(vcpu, new_cr3, new_role);
 
 	return false;
 }
@@ -4336,9 +4305,33 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			      union kvm_mmu_page_role new_role,
 			      bool skip_tlb_flush)
 {
-	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
-		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
-				   KVM_MMU_ROOT_CURRENT);
+	if (!fast_cr3_switch(vcpu, new_cr3, new_role)) {
+		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
+		return;
+	}
+
+	/*
+	 * It's possible that the cached previous root page is obsolete because
+	 * of a change in the MMU generation number. However, changing the
+	 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will
+	 * free the root set here and allocate a new one.
+	 */
+	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+	if (!skip_tlb_flush) {
+		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+	}
+
+	/*
+	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
+	 * switching to a new CR3, that GVA->GPA mapping may no longer be
+	 * valid. So clear any cached MMIO info even when we don't need to sync
+	 * the shadow page tables.
+	 */
+	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+	__clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
 }
 
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
-- 
cgit v1.2.3


From 4a632ac6ca66fb29b94a16495624c58f4d313f2f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:27 -0700
Subject: KVM: x86/mmu: Add separate override for MMU sync during fast CR3
 switch

Add a separate "skip" override for MMU sync, a future change to avoid
TLB flushes on nested VMX transitions may need to sync the MMU even if
the TLB flush is unnecessary.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-32-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/mmu/mmu.c          | 13 +++++++------
 arch/x86/kvm/vmx/nested.c       |  2 +-
 arch/x86/kvm/x86.c              |  2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 541e2df8fc6e..ee5886a5006a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1524,7 +1524,8 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gva_t gva, hpa_t root_hpa);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush,
+		     bool skip_mmu_sync);
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6a939204d467..97d1e9b80b8e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4303,7 +4303,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 
 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			      union kvm_mmu_page_role new_role,
-			      bool skip_tlb_flush)
+			      bool skip_tlb_flush, bool skip_mmu_sync)
 {
 	if (!fast_cr3_switch(vcpu, new_cr3, new_role)) {
 		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
@@ -4318,10 +4318,10 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	 */
 	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
 
-	if (!skip_tlb_flush) {
+	if (!skip_mmu_sync)
 		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+	if (!skip_tlb_flush)
 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-	}
 
 	/*
 	 * The last MMIO access's GVA and GPA are cached in the VCPU. When
@@ -4334,10 +4334,11 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	__clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
 }
 
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush,
+		     bool skip_mmu_sync)
 {
 	__kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
-			  skip_tlb_flush);
+			  skip_tlb_flush, skip_mmu_sync);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
 
@@ -5030,7 +5031,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
-	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
+	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false, false);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a7a526a21292..bf5efb30cc21 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1101,7 +1101,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 	}
 
 	if (!nested_ept)
-		kvm_mmu_new_cr3(vcpu, cr3, false);
+		kvm_mmu_new_cr3(vcpu, cr3, false, false);
 
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 90aa4abbc0a6..7bf5193ed9b4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1031,7 +1031,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
 
-	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
-- 
cgit v1.2.3


From 71fe70130d88729abc0e658d3202618c340d2e71 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:28 -0700
Subject: KVM: x86/mmu: Add module param to force TLB flush on root reuse

Add a module param, flush_on_reuse, to override skip_tlb_flush and
skip_mmu_sync when performing a so called "fast cr3 switch", i.e. when
reusing a cached root.  The primary motiviation for the control is to
provide a fallback mechanism in the event that TLB flushing and/or MMU
sync bugs are exposed/introduced by upcoming changes to stop
unconditionally flushing on nested VMX transitions.

Suggested-by: Jim Mattson <jmattson@google.com>
Suggested-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-33-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 97d1e9b80b8e..c98948459414 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -78,6 +78,9 @@ module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
 		&nx_huge_pages_recovery_ratio, 0644);
 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -4318,9 +4321,9 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	 */
 	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
 
-	if (!skip_mmu_sync)
+	if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
 		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-	if (!skip_tlb_flush)
+	if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 
 	/*
-- 
cgit v1.2.3


From 41fab65e7c44ab1c3aa4b962e6f95649354db419 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:29 -0700
Subject: KVM: nVMX: Skip MMU sync on nested VMX transition when possible

Skip the MMU sync when reusing a cached root if EPT is enabled or L1
enabled VPID for L2.

If EPT is enabled, guest-physical mappings aren't flushed even if VPID
is disabled, i.e. L1 can't expect stale TLB entries to be flushed if it
has enabled EPT and L0 isn't shadowing PTEs (for L1 or L2) if L1 has
EPT disabled.

If VPID is enabled (and EPT is disabled), then L1 can't expect stale TLB
entries to be flushed (for itself or L2).

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-34-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c    |  2 +-
 arch/x86/kvm/vmx/nested.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c98948459414..34c015461c35 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5034,7 +5034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
-	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false, false);
+	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false, true);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bf5efb30cc21..a8290954caa7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1073,6 +1073,48 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
 	return (val & invalid_mask) == 0;
 }
 
+/*
+ * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
+ * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
+ * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
+ * Here's why.
+ *
+ * If EPT is enabled by L0 a sync is never needed:
+ * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
+ *   cannot be unsync'd SPTEs for either L1 or L2.
+ *
+ * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
+ *   VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
+ *   (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
+ *   stale guest-physical mappings for L2 from the TLB.  And as above, L0 isn't
+ *   shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
+ *
+ * If EPT is disabled by L0:
+ * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
+ *   enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
+ *   required to invalidate linear mappings (EPT is disabled so there are
+ *   no combined or guest-physical mappings), i.e. L1 can't rely on the
+ *   (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
+ *
+ * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
+ *   linear mappings (EPT is disabled so there are no combined or guest-physical
+ *   mappings) to be invalidated on both VM-Enter and VM-Exit.
+ *
+ * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
+ * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
+ * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
+ * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
+ * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
+ * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
+ * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
+ * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
+ * stale TLB entries, at which point L0 will sync L2's MMU.
+ */
+static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
+{
+	return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
+}
+
 /*
  * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
  * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
@@ -1100,8 +1142,12 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 		}
 	}
 
+	/*
+	 * See nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
+	 */
 	if (!nested_ept)
-		kvm_mmu_new_cr3(vcpu, cr3, false, false);
+		kvm_mmu_new_cr3(vcpu, cr3, false,
+				!nested_vmx_transition_mmu_sync(vcpu));
 
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
-- 
cgit v1.2.3


From 9805c5f74b8a65da397ec6b70734e44f1f191c0b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:30 -0700
Subject: KVM: nVMX: Don't flush TLB on nested VMX transition

Unconditionally skip the TLB flush triggered when reusing a root for a
nested transition as nested_vmx_transition_tlb_flush() ensures the TLB
is flushed when needed, regardless of whether the MMU can reuse a cached
root (or the last root).

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-35-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c    | 2 +-
 arch/x86/kvm/vmx/nested.c | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 34c015461c35..10fb9e7f1eca 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5034,7 +5034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
-	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false, true);
+	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, true, true);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a8290954caa7..237e2e32a659 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1143,10 +1143,12 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 	}
 
 	/*
-	 * See nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
+	 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
+	 * flushes are handled by nested_vmx_transition_tlb_flush().  See
+	 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
 	 */
 	if (!nested_ept)
-		kvm_mmu_new_cr3(vcpu, cr3, false,
+		kvm_mmu_new_cr3(vcpu, cr3, true,
 				!nested_vmx_transition_mmu_sync(vcpu));
 
 	vcpu->arch.cr3 = cr3;
-- 
cgit v1.2.3


From ce8fe7b77bd8ee405295e349c82d0ef8c9788200 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:31 -0700
Subject: KVM: nVMX: Free only the affected contexts when emulating INVEPT

Add logic to handle_invept() to free only those roots that match the
target EPT context when emulating a single-context INVEPT.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-36-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 237e2e32a659..7606d544ce9c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5163,17 +5163,27 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	return nested_vmx_succeed(vcpu);
 }
 
+#define EPTP_PA_MASK   GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+	return VALID_PAGE(root_hpa) &&
+		((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
 /* Emulate the INVEPT instruction */
 static int handle_invept(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 vmx_instruction_info, types;
-	unsigned long type;
+	unsigned long type, roots_to_free;
+	struct kvm_mmu *mmu;
 	gva_t gva;
 	struct x86_exception e;
 	struct {
 		u64 eptp, gpa;
 	} operand;
+	int i;
 
 	if (!(vmx->nested.msrs.secondary_ctls_high &
 	      SECONDARY_EXEC_ENABLE_EPT) ||
@@ -5205,27 +5215,41 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
+	/*
+	 * Nested EPT roots are always held through guest_mmu,
+	 * not root_mmu.
+	 */
+	mmu = &vcpu->arch.guest_mmu;
+
 	switch (type) {
 	case VMX_EPT_EXTENT_CONTEXT:
 		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
 			return nested_vmx_failValid(vcpu,
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 
-		/* TODO: sync only the target EPTP context. */
-		fallthrough;
+		roots_to_free = 0;
+		if (nested_ept_root_matches(mmu->root_hpa, mmu->root_cr3,
+					    operand.eptp))
+			roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+						    mmu->prev_roots[i].cr3,
+						    operand.eptp))
+				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+		}
+		break;
 	case VMX_EPT_EXTENT_GLOBAL:
-		/*
-		 * Nested EPT roots are always held through guest_mmu,
-		 * not root_mmu.
-		 */
-		kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu,
-				   KVM_MMU_ROOTS_ALL);
+		roots_to_free = KVM_MMU_ROOTS_ALL;
 		break;
 	default:
 		BUG_ON(1);
 		break;
 	}
 
+	if (roots_to_free)
+		kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+
 	return nested_vmx_succeed(vcpu);
 }
 
-- 
cgit v1.2.3


From be01e8e2c632c41c69bb30e7196661ec6e8fdc10 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:32 -0700
Subject: KVM: x86: Replace "cr3" with "pgd" in "new cr3/pgd" related code

Rename functions and variables in kvm_mmu_new_cr3() and related code to
replace "cr3" with "pgd", i.e. continue the work started by commit
727a7e27cf88a ("KVM: x86: rename set_cr3 callback and related flags to
load_mmu_pgd").  kvm_mmu_new_cr3() and company are not always loading a
new CR3, e.g. when nested EPT is enabled "cr3" is actually an EPTP.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-37-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  8 +++---
 arch/x86/kvm/mmu/mmu.c          | 58 ++++++++++++++++++++---------------------
 arch/x86/kvm/vmx/nested.c       |  6 ++---
 arch/x86/kvm/vmx/vmx.c          |  2 +-
 arch/x86/kvm/x86.c              |  2 +-
 5 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ee5886a5006a..d099749168f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -375,12 +375,12 @@ struct rsvd_bits_validate {
 };
 
 struct kvm_mmu_root_info {
-	gpa_t cr3;
+	gpa_t pgd;
 	hpa_t hpa;
 };
 
 #define KVM_MMU_ROOT_INFO_INVALID \
-	((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
+	((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
@@ -406,7 +406,7 @@ struct kvm_mmu {
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
-	gpa_t root_cr3;
+	gpa_t root_pgd;
 	union kvm_mmu_role mmu_role;
 	u8 root_level;
 	u8 shadow_root_level;
@@ -1524,7 +1524,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gva_t gva, hpa_t root_hpa);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush,
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
 		     bool skip_mmu_sync);
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 10fb9e7f1eca..3385d6dde541 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3665,7 +3665,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 							   &invalid_list);
 			mmu->root_hpa = INVALID_PAGE;
 		}
-		mmu->root_cr3 = 0;
+		mmu->root_pgd = 0;
 	}
 
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3722,8 +3722,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	} else
 		BUG();
 
-	/* root_cr3 is ignored for direct MMUs. */
-	vcpu->arch.mmu->root_cr3 = 0;
+	/* root_pgd is ignored for direct MMUs. */
+	vcpu->arch.mmu->root_pgd = 0;
 
 	return 0;
 }
@@ -3732,11 +3732,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
 	u64 pdptr, pm_mask;
-	gfn_t root_gfn, root_cr3;
+	gfn_t root_gfn, root_pgd;
 	int i;
 
-	root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
-	root_gfn = root_cr3 >> PAGE_SHIFT;
+	root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
+	root_gfn = root_pgd >> PAGE_SHIFT;
 
 	if (mmu_check_root(vcpu, root_gfn))
 		return 1;
@@ -3761,7 +3761,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		++sp->root_count;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu->root_hpa = root;
-		goto set_root_cr3;
+		goto set_root_pgd;
 	}
 
 	/*
@@ -3827,8 +3827,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
 	}
 
-set_root_cr3:
-	vcpu->arch.mmu->root_cr3 = root_cr3;
+set_root_pgd:
+	vcpu->arch.mmu->root_pgd = root_pgd;
 
 	return 0;
 }
@@ -4244,49 +4244,49 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->nx = false;
 }
 
-static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3,
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
 				  union kvm_mmu_page_role role)
 {
-	return (role.direct || cr3 == root->cr3) &&
+	return (role.direct || pgd == root->pgd) &&
 	       VALID_PAGE(root->hpa) && page_header(root->hpa) &&
 	       role.word == page_header(root->hpa)->role.word;
 }
 
 /*
- * Find out if a previously cached root matching the new CR3/role is available.
+ * Find out if a previously cached root matching the new pgd/role is available.
  * The current root is also inserted into the cache.
  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
  * returned.
  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
  * false is returned. This root should now be freed by the caller.
  */
-static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd,
 				  union kvm_mmu_page_role new_role)
 {
 	uint i;
 	struct kvm_mmu_root_info root;
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 
-	root.cr3 = mmu->root_cr3;
+	root.pgd = mmu->root_pgd;
 	root.hpa = mmu->root_hpa;
 
-	if (is_root_usable(&root, new_cr3, new_role))
+	if (is_root_usable(&root, new_pgd, new_role))
 		return true;
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 		swap(root, mmu->prev_roots[i]);
 
-		if (is_root_usable(&root, new_cr3, new_role))
+		if (is_root_usable(&root, new_pgd, new_role))
 			break;
 	}
 
 	mmu->root_hpa = root.hpa;
-	mmu->root_cr3 = root.cr3;
+	mmu->root_pgd = root.pgd;
 
 	return i < KVM_MMU_NUM_PREV_ROOTS;
 }
 
-static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
 			    union kvm_mmu_page_role new_role)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -4298,17 +4298,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	 */
 	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
 	    mmu->root_level >= PT64_ROOT_4LEVEL)
-		return !mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT) &&
-		       cached_root_available(vcpu, new_cr3, new_role);
+		return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
+		       cached_root_available(vcpu, new_pgd, new_role);
 
 	return false;
 }
 
-static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
 			      union kvm_mmu_page_role new_role,
 			      bool skip_tlb_flush, bool skip_mmu_sync)
 {
-	if (!fast_cr3_switch(vcpu, new_cr3, new_role)) {
+	if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
 		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
 		return;
 	}
@@ -4337,13 +4337,13 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 	__clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
 }
 
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush,
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
 		     bool skip_mmu_sync)
 {
-	__kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+	__kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
 			  skip_tlb_flush, skip_mmu_sync);
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
 
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 {
@@ -5034,7 +5034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level);
 
-	__kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, true, true);
+	__kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
 
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
@@ -5551,7 +5551,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
-		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
 			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
 			tlb_flush = true;
 		}
@@ -5705,13 +5705,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 
 	vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.root_mmu.root_cr3 = 0;
+	vcpu->arch.root_mmu.root_pgd = 0;
 	vcpu->arch.root_mmu.translate_gpa = translate_gpa;
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
 
 	vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.guest_mmu.root_cr3 = 0;
+	vcpu->arch.guest_mmu.root_pgd = 0;
 	vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7606d544ce9c..aca57d8da400 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1148,7 +1148,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 	 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
 	 */
 	if (!nested_ept)
-		kvm_mmu_new_cr3(vcpu, cr3, true,
+		kvm_mmu_new_pgd(vcpu, cr3, true,
 				!nested_vmx_transition_mmu_sync(vcpu));
 
 	vcpu->arch.cr3 = cr3;
@@ -5228,13 +5228,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 
 		roots_to_free = 0;
-		if (nested_ept_root_matches(mmu->root_hpa, mmu->root_cr3,
+		if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
 					    operand.eptp))
 			roots_to_free |= KVM_MMU_ROOT_CURRENT;
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
-						    mmu->prev_roots[i].cr3,
+						    mmu->prev_roots[i].pgd,
 						    operand.eptp))
 				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 		}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a87b8c1e8de6..c9c959fb8a66 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5476,7 +5476,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 		}
 
 		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
+			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
 			    == operand.pcid)
 				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bf5193ed9b4..de77bc9bd0d7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1031,7 +1031,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
 
-	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+	kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
 	vcpu->arch.cr3 = cr3;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
 
-- 
cgit v1.2.3


From be100ef136254ab4a3ff223e2288aae6c5809ac6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 20 Mar 2020 14:28:33 -0700
Subject: KVM: VMX: Clean cr3/pgd handling in vmx_load_mmu_pgd()

Rename @cr3 to @pgd in vmx_load_mmu_pgd() to reflect that it will be
loaded into vmcs.EPT_POINTER and not vmcs.GUEST_CR3 when EPT is enabled.
Similarly, load guest_cr3 with @pgd if and only if EPT is disabled.

This fixes one of the last, if not _the_ last, cases in KVM where a
variable that is not strictly a cr3 value uses "cr3" instead of "pgd".

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200320212833.3507-38-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c9c959fb8a66..aa1b8cf7c915 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3034,16 +3034,15 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 	return eptp;
 }
 
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
 {
 	struct kvm *kvm = vcpu->kvm;
 	bool update_guest_cr3 = true;
 	unsigned long guest_cr3;
 	u64 eptp;
 
-	guest_cr3 = cr3;
 	if (enable_ept) {
-		eptp = construct_eptp(vcpu, cr3);
+		eptp = construct_eptp(vcpu, pgd);
 		vmcs_write64(EPT_POINTER, eptp);
 
 		if (kvm_x86_ops.tlb_remote_flush) {
@@ -3064,6 +3063,8 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
 		else /* vmcs01.GUEST_CR3 is already up-to-date. */
 			update_guest_cr3 = false;
 		ept_load_pdptrs(vcpu);
+	} else {
+		guest_cr3 = pgd;
 	}
 
 	if (update_guest_cr3)
-- 
cgit v1.2.3


From 1c164cb3ffd084e5359a56abde715acf121e69a4 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Sat, 11 Apr 2020 17:36:27 +0200
Subject: KVM: SVM: Use do_machine_check to pass MCE to the host

Use do_machine_check instead of INT $12 to pass MCE to the host,
the same approach VMX uses.

On a related note, there is no reason to limit the use of do_machine_check
to 64 bit targets, as is currently done for VMX. MCE handling works
for both target families.

The patch is only compile tested, for both, 64 and 32 bit targets,
someone should test the passing of the exception by injecting
some MCEs into the guest.

For future non-RFC patch, kvm_machine_check should be moved to some
appropriate header file.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Message-Id: <20200411153627.3474710-1-ubizjak@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 66123848448d..747e60c65c98 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -33,6 +33,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
+#include <asm/mce.h>
 #include <asm/spec-ctrl.h>
 #include <asm/cpu_device_id.h>
 
@@ -1839,6 +1840,25 @@ static bool is_erratum_383(void)
 	return true;
 }
 
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+	struct pt_regs regs = {
+		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
+		.flags = X86_EFLAGS_IF,
+	};
+
+	do_machine_check(&regs, 0);
+#endif
+}
+
 static void svm_handle_mce(struct vcpu_svm *svm)
 {
 	if (is_erratum_383()) {
@@ -1857,11 +1877,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
 	 * On an #MC intercept the MCE handler is not called automatically in
 	 * the host. So do it by hand here.
 	 */
-	asm volatile (
-		"int $0x12\n");
-	/* not sure if we ever come back to this point */
-
-	return;
+	kvm_machine_check();
 }
 
 static int mc_interception(struct vcpu_svm *svm)
-- 
cgit v1.2.3


From 812756a82ea51e3c7ff7ba5e6fa3f34345234bc7 Mon Sep 17 00:00:00 2001
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Date: Tue, 14 Apr 2020 17:56:25 +0200
Subject: kvm_host: unify VM_STAT and VCPU_STAT definitions in a single place

The macros VM_STAT and VCPU_STAT are redundantly implemented in multiple
files, each used by a different architecure to initialize the debugfs
entries for statistics. Since they all have the same purpose, they can be
unified in a single common definition in include/linux/kvm_host.h

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Message-Id: <20200414155625.20559-1-eesposit@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/guest.c    |  23 +++---
 arch/mips/kvm/mips.c      |  61 +++++++-------
 arch/powerpc/kvm/book3s.c |  61 +++++++-------
 arch/powerpc/kvm/booke.c  |  41 +++++-----
 arch/s390/kvm/kvm-s390.c  | 203 +++++++++++++++++++++++-----------------------
 arch/x86/kvm/x86.c        |  80 +++++++++---------
 include/linux/kvm_host.h  |   5 ++
 7 files changed, 231 insertions(+), 243 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 23ebe51410f0..8417b200bec9 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,20 +29,17 @@
 
 #include "trace.h"
 
-#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM }
-#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	VCPU_STAT(halt_successful_poll),
-	VCPU_STAT(halt_attempted_poll),
-	VCPU_STAT(halt_poll_invalid),
-	VCPU_STAT(halt_wakeup),
-	VCPU_STAT(hvc_exit_stat),
-	VCPU_STAT(wfe_exit_stat),
-	VCPU_STAT(wfi_exit_stat),
-	VCPU_STAT(mmio_exit_user),
-	VCPU_STAT(mmio_exit_kernel),
-	VCPU_STAT(exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
+	VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
+	VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
+	VCPU_STAT("mmio_exit_user", mmio_exit_user),
+	VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
+	VCPU_STAT("exits", exits),
 	{ NULL }
 };
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 8f05dd0a0f4e..fdf1c14d9205 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -39,40 +39,39 @@
 #define VECTORSPACING 0x100	/* for EI/VI mode */
 #endif
 
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x)
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "wait",	  VCPU_STAT(wait_exits),	 KVM_STAT_VCPU },
-	{ "cache",	  VCPU_STAT(cache_exits),	 KVM_STAT_VCPU },
-	{ "signal",	  VCPU_STAT(signal_exits),	 KVM_STAT_VCPU },
-	{ "interrupt",	  VCPU_STAT(int_exits),		 KVM_STAT_VCPU },
-	{ "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU },
-	{ "tlbmod",	  VCPU_STAT(tlbmod_exits),	 KVM_STAT_VCPU },
-	{ "tlbmiss_ld",	  VCPU_STAT(tlbmiss_ld_exits),	 KVM_STAT_VCPU },
-	{ "tlbmiss_st",	  VCPU_STAT(tlbmiss_st_exits),	 KVM_STAT_VCPU },
-	{ "addrerr_st",	  VCPU_STAT(addrerr_st_exits),	 KVM_STAT_VCPU },
-	{ "addrerr_ld",	  VCPU_STAT(addrerr_ld_exits),	 KVM_STAT_VCPU },
-	{ "syscall",	  VCPU_STAT(syscall_exits),	 KVM_STAT_VCPU },
-	{ "resvd_inst",	  VCPU_STAT(resvd_inst_exits),	 KVM_STAT_VCPU },
-	{ "break_inst",	  VCPU_STAT(break_inst_exits),	 KVM_STAT_VCPU },
-	{ "trap_inst",	  VCPU_STAT(trap_inst_exits),	 KVM_STAT_VCPU },
-	{ "msa_fpe",	  VCPU_STAT(msa_fpe_exits),	 KVM_STAT_VCPU },
-	{ "fpe",	  VCPU_STAT(fpe_exits),		 KVM_STAT_VCPU },
-	{ "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
-	{ "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+	VCPU_STAT("wait", wait_exits),
+	VCPU_STAT("cache", cache_exits),
+	VCPU_STAT("signal", signal_exits),
+	VCPU_STAT("interrupt", int_exits),
+	VCPU_STAT("cop_unusable", cop_unusable_exits),
+	VCPU_STAT("tlbmod", tlbmod_exits),
+	VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
+	VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
+	VCPU_STAT("addrerr_st", addrerr_st_exits),
+	VCPU_STAT("addrerr_ld", addrerr_ld_exits),
+	VCPU_STAT("syscall", syscall_exits),
+	VCPU_STAT("resvd_inst", resvd_inst_exits),
+	VCPU_STAT("break_inst", break_inst_exits),
+	VCPU_STAT("trap_inst", trap_inst_exits),
+	VCPU_STAT("msa_fpe", msa_fpe_exits),
+	VCPU_STAT("fpe", fpe_exits),
+	VCPU_STAT("msa_disabled", msa_disabled_exits),
+	VCPU_STAT("flush_dcache", flush_dcache_exits),
 #ifdef CONFIG_KVM_MIPS_VZ
-	{ "vz_gpsi",	  VCPU_STAT(vz_gpsi_exits),	 KVM_STAT_VCPU },
-	{ "vz_gsfc",	  VCPU_STAT(vz_gsfc_exits),	 KVM_STAT_VCPU },
-	{ "vz_hc",	  VCPU_STAT(vz_hc_exits),	 KVM_STAT_VCPU },
-	{ "vz_grr",	  VCPU_STAT(vz_grr_exits),	 KVM_STAT_VCPU },
-	{ "vz_gva",	  VCPU_STAT(vz_gva_exits),	 KVM_STAT_VCPU },
-	{ "vz_ghfc",	  VCPU_STAT(vz_ghfc_exits),	 KVM_STAT_VCPU },
-	{ "vz_gpa",	  VCPU_STAT(vz_gpa_exits),	 KVM_STAT_VCPU },
-	{ "vz_resvd",	  VCPU_STAT(vz_resvd_exits),	 KVM_STAT_VCPU },
+	VCPU_STAT("vz_gpsi", vz_gpsi_exits),
+	VCPU_STAT("vz_gsfc", vz_gsfc_exits),
+	VCPU_STAT("vz_hc", vz_hc_exits),
+	VCPU_STAT("vz_grr", vz_grr_exits),
+	VCPU_STAT("vz_gva", vz_gva_exits),
+	VCPU_STAT("vz_ghfc", vz_ghfc_exits),
+	VCPU_STAT("vz_gpa", vz_gpa_exits),
+	VCPU_STAT("vz_resvd", vz_resvd_exits),
 #endif
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
-	{ "halt_wakeup",  VCPU_STAT(halt_wakeup),	 KVM_STAT_VCPU },
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
 	{NULL}
 };
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 5690a1f9b976..37508a356f28 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -36,41 +36,38 @@
 #include "book3s.h"
 #include "trace.h"
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 /* #define EXIT_DEBUG */
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "exits",       VCPU_STAT(sum_exits) },
-	{ "mmio",        VCPU_STAT(mmio_exits) },
-	{ "sig",         VCPU_STAT(signal_exits) },
-	{ "sysc",        VCPU_STAT(syscall_exits) },
-	{ "inst_emu",    VCPU_STAT(emulated_inst_exits) },
-	{ "dec",         VCPU_STAT(dec_exits) },
-	{ "ext_intr",    VCPU_STAT(ext_intr_exits) },
-	{ "queue_intr",  VCPU_STAT(queue_intr) },
-	{ "halt_poll_success_ns",	VCPU_STAT(halt_poll_success_ns) },
-	{ "halt_poll_fail_ns",		VCPU_STAT(halt_poll_fail_ns) },
-	{ "halt_wait_ns",		VCPU_STAT(halt_wait_ns) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
-	{ "halt_successful_wait",	VCPU_STAT(halt_successful_wait) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "pf_storage",  VCPU_STAT(pf_storage) },
-	{ "sp_storage",  VCPU_STAT(sp_storage) },
-	{ "pf_instruc",  VCPU_STAT(pf_instruc) },
-	{ "sp_instruc",  VCPU_STAT(sp_instruc) },
-	{ "ld",          VCPU_STAT(ld) },
-	{ "ld_slow",     VCPU_STAT(ld_slow) },
-	{ "st",          VCPU_STAT(st) },
-	{ "st_slow",     VCPU_STAT(st_slow) },
-	{ "pthru_all",       VCPU_STAT(pthru_all) },
-	{ "pthru_host",      VCPU_STAT(pthru_host) },
-	{ "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
-	{ "largepages_2M",    VM_STAT(num_2M_pages, .mode = 0444) },
-	{ "largepages_1G",    VM_STAT(num_1G_pages, .mode = 0444) },
+	VCPU_STAT("exits", sum_exits),
+	VCPU_STAT("mmio", mmio_exits),
+	VCPU_STAT("sig", signal_exits),
+	VCPU_STAT("sysc", syscall_exits),
+	VCPU_STAT("inst_emu", emulated_inst_exits),
+	VCPU_STAT("dec", dec_exits),
+	VCPU_STAT("ext_intr", ext_intr_exits),
+	VCPU_STAT("queue_intr", queue_intr),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("halt_wait_ns", halt_wait_ns),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_successful_wait", halt_successful_wait),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("pf_storage", pf_storage),
+	VCPU_STAT("sp_storage", sp_storage),
+	VCPU_STAT("pf_instruc", pf_instruc),
+	VCPU_STAT("sp_instruc", sp_instruc),
+	VCPU_STAT("ld", ld),
+	VCPU_STAT("ld_slow", ld_slow),
+	VCPU_STAT("st", st),
+	VCPU_STAT("st_slow", st_slow),
+	VCPU_STAT("pthru_all", pthru_all),
+	VCPU_STAT("pthru_host", pthru_host),
+	VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
+	VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
+	VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
 	{ NULL }
 };
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6c18ea88fd25..c2984cb6dfa7 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -35,29 +35,26 @@
 
 unsigned long kvmppc_booke_handlers;
 
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "mmio",       VCPU_STAT(mmio_exits) },
-	{ "sig",        VCPU_STAT(signal_exits) },
-	{ "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
-	{ "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
-	{ "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
-	{ "dtlb_v",     VCPU_STAT(dtlb_virt_miss_exits) },
-	{ "sysc",       VCPU_STAT(syscall_exits) },
-	{ "isi",        VCPU_STAT(isi_exits) },
-	{ "dsi",        VCPU_STAT(dsi_exits) },
-	{ "inst_emu",   VCPU_STAT(emulated_inst_exits) },
-	{ "dec",        VCPU_STAT(dec_exits) },
-	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "doorbell", VCPU_STAT(dbell_exits) },
-	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	VCPU_STAT("mmio", mmio_exits),
+	VCPU_STAT("sig", signal_exits),
+	VCPU_STAT("itlb_r", itlb_real_miss_exits),
+	VCPU_STAT("itlb_v", itlb_virt_miss_exits),
+	VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
+	VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
+	VCPU_STAT("sysc", syscall_exits),
+	VCPU_STAT("isi", isi_exits),
+	VCPU_STAT("dsi", dsi_exits),
+	VCPU_STAT("inst_emu", emulated_inst_exits),
+	VCPU_STAT("dec", dec_exits),
+	VCPU_STAT("ext_intr", ext_intr_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("doorbell", dbell_exits),
+	VCPU_STAT("guest doorbell", gdbell_exits),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
 	{ NULL }
 };
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5dcf9ff12828..5307929a6c89 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -57,110 +57,107 @@
 #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
 			   (KVM_MAX_VCPUS + LOCAL_IRQS))
 
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "userspace_handled", VCPU_STAT(exit_userspace) },
-	{ "exit_null", VCPU_STAT(exit_null) },
-	{ "exit_validity", VCPU_STAT(exit_validity) },
-	{ "exit_stop_request", VCPU_STAT(exit_stop_request) },
-	{ "exit_external_request", VCPU_STAT(exit_external_request) },
-	{ "exit_io_request", VCPU_STAT(exit_io_request) },
-	{ "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
-	{ "exit_instruction", VCPU_STAT(exit_instruction) },
-	{ "exit_pei", VCPU_STAT(exit_pei) },
-	{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
-	{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
-	{ "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
-	{ "instruction_lctl", VCPU_STAT(instruction_lctl) },
-	{ "instruction_stctl", VCPU_STAT(instruction_stctl) },
-	{ "instruction_stctg", VCPU_STAT(instruction_stctg) },
-	{ "deliver_ckc", VCPU_STAT(deliver_ckc) },
-	{ "deliver_cputm", VCPU_STAT(deliver_cputm) },
-	{ "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
-	{ "deliver_external_call", VCPU_STAT(deliver_external_call) },
-	{ "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
-	{ "deliver_virtio", VCPU_STAT(deliver_virtio) },
-	{ "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
-	{ "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
-	{ "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
-	{ "deliver_program", VCPU_STAT(deliver_program) },
-	{ "deliver_io", VCPU_STAT(deliver_io) },
-	{ "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
-	{ "exit_wait_state", VCPU_STAT(exit_wait_state) },
-	{ "inject_ckc", VCPU_STAT(inject_ckc) },
-	{ "inject_cputm", VCPU_STAT(inject_cputm) },
-	{ "inject_external_call", VCPU_STAT(inject_external_call) },
-	{ "inject_float_mchk", VM_STAT(inject_float_mchk) },
-	{ "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
-	{ "inject_io", VM_STAT(inject_io) },
-	{ "inject_mchk", VCPU_STAT(inject_mchk) },
-	{ "inject_pfault_done", VM_STAT(inject_pfault_done) },
-	{ "inject_program", VCPU_STAT(inject_program) },
-	{ "inject_restart", VCPU_STAT(inject_restart) },
-	{ "inject_service_signal", VM_STAT(inject_service_signal) },
-	{ "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
-	{ "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
-	{ "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
-	{ "inject_virtio", VM_STAT(inject_virtio) },
-	{ "instruction_epsw", VCPU_STAT(instruction_epsw) },
-	{ "instruction_gs", VCPU_STAT(instruction_gs) },
-	{ "instruction_io_other", VCPU_STAT(instruction_io_other) },
-	{ "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
-	{ "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
-	{ "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
-	{ "instruction_ptff", VCPU_STAT(instruction_ptff) },
-	{ "instruction_stidp", VCPU_STAT(instruction_stidp) },
-	{ "instruction_sck", VCPU_STAT(instruction_sck) },
-	{ "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
-	{ "instruction_spx", VCPU_STAT(instruction_spx) },
-	{ "instruction_stpx", VCPU_STAT(instruction_stpx) },
-	{ "instruction_stap", VCPU_STAT(instruction_stap) },
-	{ "instruction_iske", VCPU_STAT(instruction_iske) },
-	{ "instruction_ri", VCPU_STAT(instruction_ri) },
-	{ "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
-	{ "instruction_sske", VCPU_STAT(instruction_sske) },
-	{ "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
-	{ "instruction_essa", VCPU_STAT(instruction_essa) },
-	{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
-	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
-	{ "instruction_tb", VCPU_STAT(instruction_tb) },
-	{ "instruction_tpi", VCPU_STAT(instruction_tpi) },
-	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
-	{ "instruction_tsch", VCPU_STAT(instruction_tsch) },
-	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
-	{ "instruction_sie", VCPU_STAT(instruction_sie) },
-	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
-	{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
-	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
-	{ "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
-	{ "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) },
-	{ "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) },
-	{ "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
-	{ "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) },
-	{ "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) },
-	{ "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) },
-	{ "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
-	{ "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) },
-	{ "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) },
-	{ "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
-	{ "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
-	{ "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
-	{ "instruction_diag_10", VCPU_STAT(diagnose_10) },
-	{ "instruction_diag_44", VCPU_STAT(diagnose_44) },
-	{ "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
-	{ "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
-	{ "instruction_diag_258", VCPU_STAT(diagnose_258) },
-	{ "instruction_diag_308", VCPU_STAT(diagnose_308) },
-	{ "instruction_diag_500", VCPU_STAT(diagnose_500) },
-	{ "instruction_diag_other", VCPU_STAT(diagnose_other) },
+	VCPU_STAT("userspace_handled", exit_userspace),
+	VCPU_STAT("exit_null", exit_null),
+	VCPU_STAT("exit_validity", exit_validity),
+	VCPU_STAT("exit_stop_request", exit_stop_request),
+	VCPU_STAT("exit_external_request", exit_external_request),
+	VCPU_STAT("exit_io_request", exit_io_request),
+	VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
+	VCPU_STAT("exit_instruction", exit_instruction),
+	VCPU_STAT("exit_pei", exit_pei),
+	VCPU_STAT("exit_program_interruption", exit_program_interruption),
+	VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
+	VCPU_STAT("exit_operation_exception", exit_operation_exception),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("instruction_lctlg", instruction_lctlg),
+	VCPU_STAT("instruction_lctl", instruction_lctl),
+	VCPU_STAT("instruction_stctl", instruction_stctl),
+	VCPU_STAT("instruction_stctg", instruction_stctg),
+	VCPU_STAT("deliver_ckc", deliver_ckc),
+	VCPU_STAT("deliver_cputm", deliver_cputm),
+	VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
+	VCPU_STAT("deliver_external_call", deliver_external_call),
+	VCPU_STAT("deliver_service_signal", deliver_service_signal),
+	VCPU_STAT("deliver_virtio", deliver_virtio),
+	VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
+	VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
+	VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
+	VCPU_STAT("deliver_program", deliver_program),
+	VCPU_STAT("deliver_io", deliver_io),
+	VCPU_STAT("deliver_machine_check", deliver_machine_check),
+	VCPU_STAT("exit_wait_state", exit_wait_state),
+	VCPU_STAT("inject_ckc", inject_ckc),
+	VCPU_STAT("inject_cputm", inject_cputm),
+	VCPU_STAT("inject_external_call", inject_external_call),
+	VM_STAT("inject_float_mchk", inject_float_mchk),
+	VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
+	VM_STAT("inject_io", inject_io),
+	VCPU_STAT("inject_mchk", inject_mchk),
+	VM_STAT("inject_pfault_done", inject_pfault_done),
+	VCPU_STAT("inject_program", inject_program),
+	VCPU_STAT("inject_restart", inject_restart),
+	VM_STAT("inject_service_signal", inject_service_signal),
+	VCPU_STAT("inject_set_prefix", inject_set_prefix),
+	VCPU_STAT("inject_stop_signal", inject_stop_signal),
+	VCPU_STAT("inject_pfault_init", inject_pfault_init),
+	VM_STAT("inject_virtio", inject_virtio),
+	VCPU_STAT("instruction_epsw", instruction_epsw),
+	VCPU_STAT("instruction_gs", instruction_gs),
+	VCPU_STAT("instruction_io_other", instruction_io_other),
+	VCPU_STAT("instruction_lpsw", instruction_lpsw),
+	VCPU_STAT("instruction_lpswe", instruction_lpswe),
+	VCPU_STAT("instruction_pfmf", instruction_pfmf),
+	VCPU_STAT("instruction_ptff", instruction_ptff),
+	VCPU_STAT("instruction_stidp", instruction_stidp),
+	VCPU_STAT("instruction_sck", instruction_sck),
+	VCPU_STAT("instruction_sckpf", instruction_sckpf),
+	VCPU_STAT("instruction_spx", instruction_spx),
+	VCPU_STAT("instruction_stpx", instruction_stpx),
+	VCPU_STAT("instruction_stap", instruction_stap),
+	VCPU_STAT("instruction_iske", instruction_iske),
+	VCPU_STAT("instruction_ri", instruction_ri),
+	VCPU_STAT("instruction_rrbe", instruction_rrbe),
+	VCPU_STAT("instruction_sske", instruction_sske),
+	VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
+	VCPU_STAT("instruction_essa", instruction_essa),
+	VCPU_STAT("instruction_stsi", instruction_stsi),
+	VCPU_STAT("instruction_stfl", instruction_stfl),
+	VCPU_STAT("instruction_tb", instruction_tb),
+	VCPU_STAT("instruction_tpi", instruction_tpi),
+	VCPU_STAT("instruction_tprot", instruction_tprot),
+	VCPU_STAT("instruction_tsch", instruction_tsch),
+	VCPU_STAT("instruction_sthyi", instruction_sthyi),
+	VCPU_STAT("instruction_sie", instruction_sie),
+	VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
+	VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
+	VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
+	VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
+	VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
+	VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
+	VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
+	VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
+	VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
+	VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
+	VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
+	VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
+	VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
+	VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
+	VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
+	VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
+	VCPU_STAT("instruction_diag_10", diagnose_10),
+	VCPU_STAT("instruction_diag_44", diagnose_44),
+	VCPU_STAT("instruction_diag_9c", diagnose_9c),
+	VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
+	VCPU_STAT("instruction_diag_258", diagnose_258),
+	VCPU_STAT("instruction_diag_308", diagnose_308),
+	VCPU_STAT("instruction_diag_500", diagnose_500),
+	VCPU_STAT("instruction_diag_other", diagnose_other),
 	{ NULL }
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index de77bc9bd0d7..8104dc9ff5b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,9 +97,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -194,45 +191,44 @@ u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "pf_fixed", VCPU_STAT(pf_fixed) },
-	{ "pf_guest", VCPU_STAT(pf_guest) },
-	{ "tlb_flush", VCPU_STAT(tlb_flush) },
-	{ "invlpg", VCPU_STAT(invlpg) },
-	{ "exits", VCPU_STAT(exits) },
-	{ "io_exits", VCPU_STAT(io_exits) },
-	{ "mmio_exits", VCPU_STAT(mmio_exits) },
-	{ "signal_exits", VCPU_STAT(signal_exits) },
-	{ "irq_window", VCPU_STAT(irq_window_exits) },
-	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
-	{ "halt_exits", VCPU_STAT(halt_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "irq_exits", VCPU_STAT(irq_exits) },
-	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "fpu_reload", VCPU_STAT(fpu_reload) },
-	{ "insn_emulation", VCPU_STAT(insn_emulation) },
-	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-	{ "irq_injections", VCPU_STAT(irq_injections) },
-	{ "nmi_injections", VCPU_STAT(nmi_injections) },
-	{ "req_event", VCPU_STAT(req_event) },
-	{ "l1d_flush", VCPU_STAT(l1d_flush) },
-	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-	{ "mmu_flooded", VM_STAT(mmu_flooded) },
-	{ "mmu_recycled", VM_STAT(mmu_recycled) },
-	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages, .mode = 0444) },
-	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-	{ "max_mmu_page_hash_collisions",
-		VM_STAT(max_mmu_page_hash_collisions) },
+	VCPU_STAT("pf_fixed", pf_fixed),
+	VCPU_STAT("pf_guest", pf_guest),
+	VCPU_STAT("tlb_flush", tlb_flush),
+	VCPU_STAT("invlpg", invlpg),
+	VCPU_STAT("exits", exits),
+	VCPU_STAT("io_exits", io_exits),
+	VCPU_STAT("mmio_exits", mmio_exits),
+	VCPU_STAT("signal_exits", signal_exits),
+	VCPU_STAT("irq_window", irq_window_exits),
+	VCPU_STAT("nmi_window", nmi_window_exits),
+	VCPU_STAT("halt_exits", halt_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hypercalls", hypercalls),
+	VCPU_STAT("request_irq", request_irq_exits),
+	VCPU_STAT("irq_exits", irq_exits),
+	VCPU_STAT("host_state_reload", host_state_reload),
+	VCPU_STAT("fpu_reload", fpu_reload),
+	VCPU_STAT("insn_emulation", insn_emulation),
+	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+	VCPU_STAT("irq_injections", irq_injections),
+	VCPU_STAT("nmi_injections", nmi_injections),
+	VCPU_STAT("req_event", req_event),
+	VCPU_STAT("l1d_flush", l1d_flush),
+	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+	VM_STAT("mmu_pte_write", mmu_pte_write),
+	VM_STAT("mmu_pte_updated", mmu_pte_updated),
+	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+	VM_STAT("mmu_flooded", mmu_flooded),
+	VM_STAT("mmu_recycled", mmu_recycled),
+	VM_STAT("mmu_cache_miss", mmu_cache_miss),
+	VM_STAT("mmu_unsync", mmu_unsync),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("largepages", lpages, .mode = 0444),
+	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 	{ NULL }
 };
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01276e3d01b9..658215f6102c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1130,6 +1130,11 @@ struct kvm_stats_debugfs_item {
 #define KVM_DBGFS_GET_MODE(dbgfs_item)                                         \
 	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
 
+#define VM_STAT(n, x, ...) 							\
+	{ n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
+#define VCPU_STAT(n, x, ...)							\
+	{ n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
-- 
cgit v1.2.3


From 789afc5ccd4ef1374669f788c90a870d4e41074c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:10 -0700
Subject: KVM: nVMX: Move reflection check into nested_vmx_reflect_vmexit()

Move the call to nested_vmx_exit_reflected() from vmx_handle_exit() into
nested_vmx_reflect_vmexit() and change the semantics of the return value
for nested_vmx_reflect_vmexit() to indicate whether or not the exit was
reflected into L1.  nested_vmx_exit_reflected() and
nested_vmx_reflect_vmexit() are intrinsically tied together, calling one
without simultaneously calling the other makes little sense.

No functional change intended.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.h | 16 +++++++++++-----
 arch/x86/kvm/vmx/vmx.c    |  4 ++--
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index e8f4a74f7251..d6112b3e9ecf 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -81,12 +81,16 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Reflect a VM Exit into L1.
+ * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
+ * reflected into L1.
  */
-static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
-					    u32 exit_reason)
+static inline bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
+					     u32 exit_reason)
 {
-	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	u32 exit_intr_info;
+
+	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
+		return false;
 
 	/*
 	 * At this point, the exit interruption info in exit_intr_info
@@ -94,6 +98,8 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
 	 * we need to query the in-kernel LAPIC.
 	 */
 	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if ((exit_intr_info &
 	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
@@ -105,7 +111,7 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
 
 	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
 			  vmcs_readl(EXIT_QUALIFICATION));
-	return 1;
+	return true;
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aa1b8cf7c915..d92ed73b22da 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5905,8 +5905,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 		 */
 		nested_mark_vmcs12_pages_dirty(vcpu);
 
-		if (nested_vmx_exit_reflected(vcpu, exit_reason))
-			return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+		if (nested_vmx_reflect_vmexit(vcpu, exit_reason))
+			return 1;
 	}
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
-- 
cgit v1.2.3


From 7b7bd87dbd6aa8c09d5e8a8028bda69c3ab13969 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:11 -0700
Subject: KVM: nVMX: Uninline nested_vmx_reflect_vmexit(), i.e. move it to
 nested.c

Uninline nested_vmx_reflect_vmexit() in preparation of refactoring
nested_vmx_exit_reflected() to split up the reflection logic into more
consumable chunks, e.g. VM-Fail vs. L1 wants the exit vs. L0 always
handles the exit.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 34 +++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/nested.h | 36 +-----------------------------------
 2 files changed, 34 insertions(+), 36 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index aca57d8da400..afc8c3538ab3 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5646,7 +5646,7 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
  * should handle it ourselves in L0 (and then continue L2). Only call this
  * when in is_guest_mode (L2).
  */
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5813,6 +5813,38 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 	}
 }
 
+/*
+ * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+	u32 exit_intr_info;
+
+	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
+		return false;
+
+	/*
+	 * At this point, the exit interruption info in exit_intr_info
+	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
+	 * we need to query the in-kernel LAPIC.
+	 */
+	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	if ((exit_intr_info &
+	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+		vmcs12->vm_exit_intr_error_code =
+			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	}
+
+	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+			  vmcs_readl(EXIT_QUALIFICATION));
+	return true;
+}
 
 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 				struct kvm_nested_state __user *user_kvm_nested_state,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index d6112b3e9ecf..bd959bd2eb58 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -25,7 +25,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void);
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 						     bool from_vmentry);
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason);
 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		       u32 exit_intr_info, unsigned long exit_qualification);
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
@@ -80,40 +80,6 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
 	return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
 }
 
-/*
- * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
- * reflected into L1.
- */
-static inline bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
-					     u32 exit_reason)
-{
-	u32 exit_intr_info;
-
-	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
-		return false;
-
-	/*
-	 * At this point, the exit interruption info in exit_intr_info
-	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
-	 * we need to query the in-kernel LAPIC.
-	 */
-	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
-
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	if ((exit_intr_info &
-	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-		vmcs12->vm_exit_intr_error_code =
-			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-	}
-
-	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
-			  vmcs_readl(EXIT_QUALIFICATION));
-	return true;
-}
-
 /*
  * Return the cr0 value that a nested guest would read. This is a combination
  * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
-- 
cgit v1.2.3


From fbdd50250396d55bf9986b2b9f271a7e91c111ea Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:12 -0700
Subject: KVM: nVMX: Move VM-Fail check out of nested_vmx_exit_reflected()

Check for VM-Fail on nested VM-Enter in nested_vmx_reflect_vmexit() in
preparation for separating nested_vmx_exit_reflected() into separate "L0
wants exit exit" and "L1 wants the exit" helpers.

Explicitly set exit_intr_info and exit_qual to zero instead of reading
them from vmcs02, as they are invalid on VM-Fail (and thankfully ignored
by nested_vmx_vmexit() for nested VM-Fail).

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-4-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index afc8c3538ab3..cb7fd78ad9da 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5652,15 +5652,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-	WARN_ON_ONCE(vmx->nested.nested_run_pending);
-
-	if (unlikely(vmx->fail)) {
-		trace_kvm_nested_vmenter_failed(
-			"hardware VM-instruction error: ",
-			vmcs_read32(VM_INSTRUCTION_ERROR));
-		return true;
-	}
-
 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
 				vmcs_readl(EXIT_QUALIFICATION),
 				vmx->idt_vectoring_info,
@@ -5819,7 +5810,23 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
  */
 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
-	u32 exit_intr_info;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exit_intr_info, exit_qual;
+
+	WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+	/*
+	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+	 * has already loaded L2's state.
+	 */
+	if (unlikely(vmx->fail)) {
+		trace_kvm_nested_vmenter_failed(
+			"hardware VM-instruction error: ",
+			vmcs_read32(VM_INSTRUCTION_ERROR));
+		exit_intr_info = 0;
+		exit_qual = 0;
+		goto reflect_vmexit;
+	}
 
 	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
 		return false;
@@ -5840,9 +5847,10 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 		vmcs12->vm_exit_intr_error_code =
 			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	}
+	exit_qual = vmcs_readl(EXIT_QUALIFICATION);
 
-	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
-			  vmcs_readl(EXIT_QUALIFICATION));
+reflect_vmexit:
+	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
 	return true;
 }
 
-- 
cgit v1.2.3


From 236871b6744a6b3c8f9653a429cc041bba8908d5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:13 -0700
Subject: KVM: nVMX: Move nested VM-Exit tracepoint into
 nested_vmx_reflect_vmexit()

Move the tracepoint for nested VM-Exits in preparation of splitting the
reflection logic into L1 wants the exit vs. L0 always handles the exit.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-5-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index cb7fd78ad9da..de122ef3eeed 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5648,19 +5648,13 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
  */
 static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
-	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
-				vmcs_readl(EXIT_QUALIFICATION),
-				vmx->idt_vectoring_info,
-				intr_info,
-				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-				KVM_ISA_VMX);
+	u32 intr_info;
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
+		intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 		if (is_nmi(intr_info))
 			return false;
 		else if (is_page_fault(intr_info))
@@ -5828,6 +5822,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 		goto reflect_vmexit;
 	}
 
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+
+	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
+				vmx->idt_vectoring_info, exit_intr_info,
+				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+				KVM_ISA_VMX);
+
 	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
 		return false;
 
@@ -5838,7 +5840,6 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 	 */
 	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if ((exit_intr_info &
 	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
@@ -5847,7 +5848,6 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 		vmcs12->vm_exit_intr_error_code =
 			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	}
-	exit_qual = vmcs_readl(EXIT_QUALIFICATION);
 
 reflect_vmexit:
 	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
-- 
cgit v1.2.3


From 2c1f3323802e4c4495bc8cc55d4867ded6014274 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:14 -0700
Subject: KVM: nVMX: Split VM-Exit reflection logic into L0 vs. L1 wants

Split the logic that determines whether a nested VM-Exit is reflected
into L1 into "L0 wants" and "L1 wants" to document the core control flow
at a high level.  If L0 wants the VM-Exit, e.g. because the exit is due
to a hardware event that isn't passed through to L1, then KVM should
handle the exit in L0 without considering L1's configuration.  Then, if
L0 doesn't want the exit, KVM needs to query L1's wants to determine
whether or not L1 "caused" the exit, e.g. by setting an exiting control,
versus the exit occurring due to an L0 setting, e.g. when L0 intercepts
an action that L1 chose to pass-through.

Note, this adds an extra read on vmcs.VM_EXIT_INTR_INFO for exception.
This will be addressed in a future patch via a VMX-wide enhancement,
rather than pile on another case where vmx->exit_intr_info is
conditionally available.

Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-6-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 111 +++++++++++++++++++++++++++++-----------------
 1 file changed, 70 insertions(+), 41 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index de122ef3eeed..c2745cd9e022 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5642,34 +5642,85 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
 }
 
 /*
- * Return true if we should exit from L2 to L1 to handle an exit, or false if we
- * should handle it ourselves in L0 (and then continue L2). Only call this
- * when in is_guest_mode (L2).
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit.  Only call this when in is_guest_mode (L2).
  */
-static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	u32 intr_info;
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
 		intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 		if (is_nmi(intr_info))
-			return false;
+			return true;
 		else if (is_page_fault(intr_info))
-			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+			return vcpu->arch.apf.host_apf_reason || !enable_ept;
 		else if (is_debug(intr_info) &&
 			 vcpu->guest_debug &
 			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-			return false;
+			return true;
 		else if (is_breakpoint(intr_info) &&
 			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
-			return false;
+			return true;
+		return false;
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		return true;
+	case EXIT_REASON_MCE_DURING_VMENTRY:
+		return true;
+	case EXIT_REASON_EPT_VIOLATION:
+		/*
+		 * L0 always deals with the EPT violation. If nested EPT is
+		 * used, and the nested mmu code discovers that the address is
+		 * missing in the guest EPT table (EPT12), the EPT violation
+		 * will be injected with nested_ept_inject_page_fault()
+		 */
+		return true;
+	case EXIT_REASON_EPT_MISCONFIG:
+		/*
+		 * L2 never uses directly L1's EPT, but rather L0's own EPT
+		 * table (shadow on EPT) or a merged EPT table that L0 built
+		 * (EPT on EPT). So any problems with the structure of the
+		 * table is L0's fault.
+		 */
+		return true;
+	case EXIT_REASON_PREEMPTION_TIMER:
+		return true;
+	case EXIT_REASON_PML_FULL:
+		/* We emulate PML support to L1. */
+		return true;
+	case EXIT_REASON_VMFUNC:
+		/* VM functions are emulated through L2->L0 vmexits. */
+		return true;
+	case EXIT_REASON_ENCLS:
+		/* SGX is never exposed to L1 */
+		return true;
+	default:
+		break;
+	}
+	return false;
+}
+
+/*
+ * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 intr_info;
+
+	switch (exit_reason) {
+	case EXIT_REASON_EXCEPTION_NMI:
+		intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+		if (is_nmi(intr_info))
+			return true;
+		else if (is_page_fault(intr_info))
+			return true;
 		return vmcs12->exception_bitmap &
 				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
 	case EXIT_REASON_EXTERNAL_INTERRUPT:
-		return false;
+		return nested_exit_on_intr(vcpu);
 	case EXIT_REASON_TRIPLE_FAULT:
 		return true;
 	case EXIT_REASON_INTERRUPT_WINDOW:
@@ -5734,7 +5785,7 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 			nested_cpu_has2(vmcs12,
 				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
 	case EXIT_REASON_MCE_DURING_VMENTRY:
-		return false;
+		return true;
 	case EXIT_REASON_TPR_BELOW_THRESHOLD:
 		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
 	case EXIT_REASON_APIC_ACCESS:
@@ -5746,22 +5797,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 		 * delivery" only come from vmcs12.
 		 */
 		return true;
-	case EXIT_REASON_EPT_VIOLATION:
-		/*
-		 * L0 always deals with the EPT violation. If nested EPT is
-		 * used, and the nested mmu code discovers that the address is
-		 * missing in the guest EPT table (EPT12), the EPT violation
-		 * will be injected with nested_ept_inject_page_fault()
-		 */
-		return false;
-	case EXIT_REASON_EPT_MISCONFIG:
-		/*
-		 * L2 never uses directly L1's EPT, but rather L0's own EPT
-		 * table (shadow on EPT) or a merged EPT table that L0 built
-		 * (EPT on EPT). So any problems with the structure of the
-		 * table is L0's fault.
-		 */
-		return false;
 	case EXIT_REASON_INVPCID:
 		return
 			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
@@ -5778,17 +5813,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 		 * the XSS exit bitmap in vmcs12.
 		 */
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
-	case EXIT_REASON_PREEMPTION_TIMER:
-		return false;
-	case EXIT_REASON_PML_FULL:
-		/* We emulate PML support to L1. */
-		return false;
-	case EXIT_REASON_VMFUNC:
-		/* VM functions are emulated through L2->L0 vmexits. */
-		return false;
-	case EXIT_REASON_ENCLS:
-		/* SGX is never exposed to L1 */
-		return false;
 	case EXIT_REASON_UMWAIT:
 	case EXIT_REASON_TPAUSE:
 		return nested_cpu_has2(vmcs12,
@@ -5830,7 +5854,12 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
 				KVM_ISA_VMX);
 
-	if (!nested_vmx_exit_reflected(vcpu, exit_reason))
+	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
+	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+		return false;
+
+	/* If L1 doesn't want the exit, handle it in L0. */
+	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
 		return false;
 
 	/*
@@ -6162,7 +6191,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 	 * reason is that if one of these bits is necessary, it will appear
 	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
 	 * fields of vmcs01 and vmcs02, will turn these bits off - and
-	 * nested_vmx_exit_reflected() will not pass related exits to L1.
+	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
 	 * These rules have exceptions below.
 	 */
 
-- 
cgit v1.2.3


From 1d283062c949f16fcc01d8a508b680e50c7d5ea0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:15 -0700
Subject: KVM: nVMX: Drop a superfluous WARN on reflecting EXTERNAL_INTERRUPT

Drop the WARN in nested_vmx_reflect_vmexit() that fires if KVM attempts
to reflect an external interrupt.  The WARN is blatantly impossible to
hit now that nested_vmx_l0_wants_exit() is called from
nested_vmx_reflect_vmexit() unconditionally returns true for
EXTERNAL_INTERRUPT.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-7-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c2745cd9e022..e8db560ee3eb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5863,12 +5863,11 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 		return false;
 
 	/*
-	 * At this point, the exit interruption info in exit_intr_info
-	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
-	 * we need to query the in-kernel LAPIC.
+	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
+	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+	 * need to be synthesized by querying the in-kernel LAPIC, but external
+	 * interrupts are never reflected to L1 so it's a non-issue.
 	 */
-	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
-
 	if ((exit_intr_info &
 	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
-- 
cgit v1.2.3


From f47baaed4fef3e31baa750cf337fe62bfcaca31f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:16 -0700
Subject: KVM: nVMX: Pull exit_reason from vcpu_vmx in
 nested_vmx_reflect_vmexit()

Grab the exit reason from the vcpu struct in nested_vmx_reflect_vmexit()
instead of having the exit reason explicitly passed from the caller.
This fixes a discrepancy between VM-Fail and VM-Exit handling, as the
VM-Fail case is already handled by checking vcpu_vmx, e.g. the exit
reason previously passed on the stack is bogus if vmx->fail is set.

Not taking the exit reason on the stack also avoids having to document
that nested_vmx_reflect_vmexit() requires the full exit reason, as
opposed to just the basic exit reason, which is not at all obvious since
the only usages of the full exit reason are for tracing and way down in
prepare_vmcs12() where it's propagated to vmcs12.

No functional change intended.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-8-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 3 ++-
 arch/x86/kvm/vmx/nested.h | 2 +-
 arch/x86/kvm/vmx/vmx.c    | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e8db560ee3eb..67d27bc40143 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5826,9 +5826,10 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
  * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
  * reflected into L1.
  */
-bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exit_reason = vmx->exit_reason;
 	u32 exit_intr_info, exit_qual;
 
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index bd959bd2eb58..61cafee13ece 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -25,7 +25,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void);
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 						     bool from_vmentry);
-bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason);
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		       u32 exit_intr_info, unsigned long exit_qualification);
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d92ed73b22da..19a82f846e8e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5905,7 +5905,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 		 */
 		nested_mark_vmcs12_pages_dirty(vcpu);
 
-		if (nested_vmx_reflect_vmexit(vcpu, exit_reason))
+		if (nested_vmx_reflect_vmexit(vcpu))
 			return 1;
 	}
 
-- 
cgit v1.2.3


From 2a7833899f6ab08d654a51f763a85ccf6c744090 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:17 -0700
Subject: KVM: nVMX: Cast exit_reason to u16 to check for nested
 EXTERNAL_INTERRUPT

Explicitly check only the basic exit reason when emulating an external
interrupt VM-Exit in nested_vmx_vmexit().  Checking the full exit reason
doesn't currently cause problems, but only because the only exit reason
modifier support by KVM is FAILED_VMENTRY, which is mutually exclusive
with EXTERNAL_INTERRUPT.  Future modifiers, e.g. ENCLAVE_MODE, will
coexist with EXTERNAL_INTERRUPT.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-9-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 67d27bc40143..23d84d063197 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4406,7 +4406,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 	if (likely(!vmx->fail)) {
-		if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+		if ((u16)exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
 		    nested_exit_intr_ack_set(vcpu)) {
 			int irq = kvm_cpu_get_interrupt(vcpu);
 			WARN_ON(irq < 0);
-- 
cgit v1.2.3


From 4dcefa312a6c1cc2aa2dd8a9ba76d97d63b3bd17 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 10:55:18 -0700
Subject: KVM: nVMX: Rename exit_reason to vm_exit_reason for nested VM-Exit

Use "vm_exit_reason" for code related to injecting a nested VM-Exit to
VM-Exits to make it clear that nested_vmx_vmexit() expects the full exit
eason, not just the basic exit reason.  The basic exit reason (bits 15:0
of vmcs.VM_EXIT_REASON) is colloquially referred to as simply "exit
reason".

Note, other flows, e.g. vmx_handle_exit(), are intentionally left as is.
A future patch will convert vmx->exit_reason to a union + bit-field, and
the exempted flows will interact with the unionized of "exit_reason".

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415175519.14230-10-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 29 +++++++++++++++--------------
 arch/x86/kvm/vmx/nested.h |  2 +-
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 23d84d063197..6f303ccd478d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -328,19 +328,19 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exit_reason;
+	u32 vm_exit_reason;
 	unsigned long exit_qualification = vcpu->arch.exit_qualification;
 
 	if (vmx->nested.pml_full) {
-		exit_reason = EXIT_REASON_PML_FULL;
+		vm_exit_reason = EXIT_REASON_PML_FULL;
 		vmx->nested.pml_full = false;
 		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
 	} else if (fault->error_code & PFERR_RSVD_MASK)
-		exit_reason = EXIT_REASON_EPT_MISCONFIG;
+		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
 	else
-		exit_reason = EXIT_REASON_EPT_VIOLATION;
+		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
 
-	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
 	vmcs12->guest_physical_address = fault->address;
 }
 
@@ -4002,11 +4002,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * which already writes to vmcs12 directly.
  */
 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-			   u32 exit_reason, u32 exit_intr_info,
+			   u32 vm_exit_reason, u32 exit_intr_info,
 			   unsigned long exit_qualification)
 {
 	/* update exit information fields: */
-	vmcs12->vm_exit_reason = exit_reason;
+	vmcs12->vm_exit_reason = vm_exit_reason;
 	vmcs12->exit_qualification = exit_qualification;
 	vmcs12->vm_exit_intr_info = exit_intr_info;
 
@@ -4318,7 +4318,7 @@ vmabort:
  * and modify vmcs12 to make it see what it would expect to see there if
  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
  */
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		       u32 exit_intr_info, unsigned long exit_qualification)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4342,9 +4342,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	if (likely(!vmx->fail)) {
 		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
 
-		if (exit_reason != -1)
-			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
-				       exit_qualification);
+		if (vm_exit_reason != -1)
+			prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+				       exit_intr_info, exit_qualification);
 
 		/*
 		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -4399,14 +4399,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 	}
 
-	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+	if ((vm_exit_reason != -1) &&
+	    (enable_shadow_vmcs || vmx->nested.hv_evmcs))
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
 
 	/* in case we halted in L2 */
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 	if (likely(!vmx->fail)) {
-		if ((u16)exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+		if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
 		    nested_exit_intr_ack_set(vcpu)) {
 			int irq = kvm_cpu_get_interrupt(vcpu);
 			WARN_ON(irq < 0);
@@ -4414,7 +4415,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
 		}
 
-		if (exit_reason != -1)
+		if (vm_exit_reason != -1)
 			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
 						       vmcs12->exit_qualification,
 						       vmcs12->idt_vectoring_info_field,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 61cafee13ece..1514ff4db77f 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -26,7 +26,7 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 						     bool from_vmentry);
 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		       u32 exit_intr_info, unsigned long exit_qualification);
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
-- 
cgit v1.2.3


From 9932b49e5abef0218254d15b8278e3dbee5ceea3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 13:34:50 -0700
Subject: KVM: nVMX: Invoke ept_save_pdptrs() if and only if PAE paging is
 enabled

Invoke ept_save_pdptrs() when restoring L1's host state on a "late"
VM-Fail if and only if PAE paging is enabled.  This saves a CALL in the
common case where L1 is a 64-bit host, and avoids incorrectly marking
the PDPTRs as dirty.

WARN if ept_save_pdptrs() is called with PAE disabled now that the
nested usage pre-checks is_pae_paging().  Barring a bug in KVM's MMU,
attempting to read the PDPTRs with PAE disabled is now impossible.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415203454.8296-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c |  2 +-
 arch/x86/kvm/vmx/vmx.c    | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6f303ccd478d..5a214cbf243c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4250,7 +4250,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
 	 * VMFail, like everything else we just need to ensure our
 	 * software model is up-to-date.
 	 */
-	if (enable_ept)
+	if (enable_ept && is_pae_paging(vcpu))
 		ept_save_pdptrs(vcpu);
 
 	kvm_mmu_reset_context(vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 19a82f846e8e..f9bee922f1e6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2937,12 +2937,13 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 
-	if (is_pae_paging(vcpu)) {
-		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
-		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
-		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
-		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
-	}
+	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+		return;
+
+	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 }
-- 
cgit v1.2.3


From e5d03de5937e915c8e41b6357c337529dfae6797 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 13:34:51 -0700
Subject: KVM: nVMX: Reset register cache (available and dirty masks) on VMCS
 switch

Reset the per-vCPU available and dirty register masks when switching
between vmcs01 and vmcs02, as the masks track state relative to the
current VMCS.  The stale masks don't cause problems in the current code
base because the registers are either unconditionally written on nested
transitions or, in the case of segment registers, have an additional
tracker that is manually reset.

Note, by dropping (previously implicitly, now explicitly) the dirty mask
when switching the active VMCS, KVM is technically losing writes to the
associated fields.  But, the only regs that can be dirtied (RIP, RSP and
PDPTRs) are unconditionally written on nested transitions, e.g. explicit
writeback is a waste of cycles, and a WARN_ON would be rather pointless.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415203454.8296-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c |  1 +
 arch/x86/kvm/vmx/vmx.c    |  7 +------
 arch/x86/kvm/vmx/vmx.h    | 11 +++++++++++
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5a214cbf243c..c01005ec445f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -307,6 +307,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	vmx_sync_vmcs_host_state(vmx, prev);
 	put_cpu();
 
+	vmx_register_cache_reset(vcpu);
 	vmx_segment_cache_clear(vmx);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f9bee922f1e6..629c89314d44 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6711,12 +6711,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	loadsegment(es, __USER_DS);
 #endif
 
-	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-				  | (1 << VCPU_EXREG_RFLAGS)
-				  | (1 << VCPU_EXREG_PDPTR)
-				  | (1 << VCPU_EXREG_SEGMENTS)
-				  | (1 << VCPU_EXREG_CR3));
-	vcpu->arch.regs_dirty = 0;
+	vmx_register_cache_reset(vcpu);
 
 	pt_guest_exit(vmx);
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 31d7252df163..6b668b604898 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -8,6 +8,7 @@
 #include <asm/intel_pt.h>
 
 #include "capabilities.h"
+#include "kvm_cache_regs.h"
 #include "ops.h"
 #include "vmcs.h"
 
@@ -447,6 +448,16 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 	vmx->segment_cache.bitmask = 0;
 }
 
+static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+				  | (1 << VCPU_EXREG_RFLAGS)
+				  | (1 << VCPU_EXREG_PDPTR)
+				  | (1 << VCPU_EXREG_SEGMENTS)
+				  | (1 << VCPU_EXREG_CR3));
+	vcpu->arch.regs_dirty = 0;
+}
+
 static inline u32 vmx_vmentry_ctrl(void)
 {
 	u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
-- 
cgit v1.2.3


From ec0241f3bbe155a58455ce4a6057be5db6529b0f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 13:34:52 -0700
Subject: KVM: nVMX: Drop manual clearing of segment cache on nested VMCS
 switch

Drop the call to vmx_segment_cache_clear() in vmx_switch_vmcs() now that
the entire register cache is reset when switching the active VMCS, e.g.
vmx_segment_cache_test_set() will reset the segment cache due to
VCPU_EXREG_SEGMENTS being unavailable.

Move vmx_segment_cache_clear() to vmx.c now that it's no longer invoked
by the nested code.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415203454.8296-4-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 1 -
 arch/x86/kvm/vmx/vmx.c    | 5 +++++
 arch/x86/kvm/vmx/vmx.h    | 5 -----
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c01005ec445f..dad6b05e35ae 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -308,7 +308,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	put_cpu();
 
 	vmx_register_cache_reset(vcpu);
-	vmx_segment_cache_clear(vmx);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 629c89314d44..e0587f532b64 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -437,6 +437,11 @@ static const struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };
 
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+	vmx->segment_cache.bitmask = 0;
+}
+
 static unsigned long host_idt_base;
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 6b668b604898..78c99a60fa49 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -443,11 +443,6 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
 BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
 BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
 
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
-	vmx->segment_cache.bitmask = 0;
-}
-
 static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-- 
cgit v1.2.3


From 5addc235199f15ae964e7ac6b20cf43f4a661573 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 13:34:53 -0700
Subject: KVM: VMX: Cache vmcs.EXIT_QUALIFICATION using arch avail_reg flags

Introduce a new "extended register" type, EXIT_INFO_1 (to pair with the
nomenclature in .get_exit_info()), and use it to cache VMX's
vmcs.EXIT_QUALIFICATION.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415203454.8296-5-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/nested.c       | 20 ++++++++++----------
 arch/x86/kvm/vmx/vmx.c          | 28 ++++++++++++++--------------
 arch/x86/kvm/vmx/vmx.h          | 15 ++++++++++++++-
 4 files changed, 39 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d099749168f0..9673ec700cd0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -170,6 +170,7 @@ enum kvm_reg {
 	VCPU_EXREG_CR3,
 	VCPU_EXREG_RFLAGS,
 	VCPU_EXREG_SEGMENTS,
+	VCPU_EXREG_EXIT_INFO_1,
 };
 
 enum {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dad6b05e35ae..dc16b2f8ce97 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4604,7 +4604,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 	gva_t gva;
 	struct x86_exception e;
 
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
 				vmcs_read32(VMX_INSTRUCTION_INFO), false,
 				sizeof(*vmpointer), &gva))
 		return 1;
@@ -4869,7 +4869,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
 						    : get_vmcs12(vcpu);
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct x86_exception e;
@@ -4955,7 +4955,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
 						    : get_vmcs12(vcpu);
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct x86_exception e;
@@ -5140,7 +5140,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 /* Emulate the VMPTRST instruction */
 static int handle_vmptrst(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
 	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
 	struct x86_exception e;
@@ -5208,7 +5208,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	/* According to the Intel VMX instruction reference, the memory
 	 * operand is read even if it isn't needed (e.g., for type==global)
 	 */
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
 			vmx_instruction_info, false, sizeof(operand), &gva))
 		return 1;
 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
@@ -5290,7 +5290,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	/* according to the intel vmx instruction reference, the memory
 	 * operand is read even if it isn't needed (e.g., for type==global)
 	 */
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
 			vmx_instruction_info, false, sizeof(operand), &gva))
 		return 1;
 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
@@ -5420,7 +5420,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 fail:
 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
 			  vmcs_read32(VM_EXIT_INTR_INFO),
-			  vmcs_readl(EXIT_QUALIFICATION));
+			  vmx_get_exit_qual(vcpu));
 	return 1;
 }
 
@@ -5471,7 +5471,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
@@ -5525,7 +5525,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 	struct vmcs12 *vmcs12)
 {
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 	int cr = exit_qualification & 15;
 	int reg;
 	unsigned long val;
@@ -5849,7 +5849,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 	}
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qual = vmx_get_exit_qual(vcpu);
 
 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
 				vmx->idt_vectoring_info, exit_intr_info,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e0587f532b64..b7aab2284a9c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4698,7 +4698,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	}
 
 	if (is_page_fault(intr_info)) {
-		cr2 = vmcs_readl(EXIT_QUALIFICATION);
+		cr2 = vmx_get_exit_qual(vcpu);
 		/* EPT won't cause page fault directly */
 		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
 		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
@@ -4714,7 +4714,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 		kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
 		return 1;
 	case DB_VECTOR:
-		dr6 = vmcs_readl(EXIT_QUALIFICATION);
+		dr6 = vmx_get_exit_qual(vcpu);
 		if (!(vcpu->guest_debug &
 		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
 			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
@@ -4769,7 +4769,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	int size, in, string;
 	unsigned port;
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 	string = (exit_qualification & 16) != 0;
 
 	++vcpu->stat.io_exits;
@@ -4860,7 +4860,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	int err;
 	int ret;
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 	cr = exit_qualification & 15;
 	reg = (exit_qualification >> 8) & 15;
 	switch ((exit_qualification >> 4) & 3) {
@@ -4937,7 +4937,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification;
 	int dr, dr7, reg;
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 
 	/* First, if DR does not exist, trigger UD */
@@ -5050,7 +5050,7 @@ static int handle_invd(struct kvm_vcpu *vcpu)
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 
 	kvm_mmu_invlpg(vcpu, exit_qualification);
 	return kvm_skip_emulated_instruction(vcpu);
@@ -5082,7 +5082,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
 	if (likely(fasteoi)) {
-		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+		unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 		int access_type, offset;
 
 		access_type = exit_qualification & APIC_ACCESS_TYPE;
@@ -5103,7 +5103,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
 
 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 	int vector = exit_qualification & 0xff;
 
 	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
@@ -5113,7 +5113,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 
 static int handle_apic_write(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
 	u32 offset = exit_qualification & 0xfff;
 
 	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
@@ -5134,7 +5134,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 
 	reason = (u32)exit_qualification >> 30;
 	if (reason == TASK_SWITCH_GATE && idt_v) {
@@ -5184,7 +5184,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	gpa_t gpa;
 	u64 error_code;
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 
 	/*
 	 * EPT violation happened while executing iret from NMI,
@@ -5444,7 +5444,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
 	/* According to the Intel instruction reference, the memory operand
 	 * is read even if it isn't needed (e.g., for type==all)
 	 */
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
 				vmx_instruction_info, false,
 				sizeof(operand), &gva))
 		return 1;
@@ -5520,7 +5520,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 
 	trace_kvm_pml_full(vcpu->vcpu_id);
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	exit_qualification = vmx_get_exit_qual(vcpu);
 
 	/*
 	 * PML buffer FULL happened while executing iret from NMI,
@@ -5634,7 +5634,7 @@ static const int kvm_vmx_max_exit_handlers =
 
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
-	*info1 = vmcs_readl(EXIT_QUALIFICATION);
+	*info1 = vmx_get_exit_qual(vcpu);
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 78c99a60fa49..a13eafec67fc 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -210,6 +210,7 @@ struct vcpu_vmx {
 	 */
 	bool		      guest_state_loaded;
 
+	unsigned long         exit_qualification;
 	u32                   exit_intr_info;
 	u32                   idt_vectoring_info;
 	ulong                 rflags;
@@ -449,7 +450,8 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 				  | (1 << VCPU_EXREG_RFLAGS)
 				  | (1 << VCPU_EXREG_PDPTR)
 				  | (1 << VCPU_EXREG_SEGMENTS)
-				  | (1 << VCPU_EXREG_CR3));
+				  | (1 << VCPU_EXREG_CR3)
+				  | (1 << VCPU_EXREG_EXIT_INFO_1));
 	vcpu->arch.regs_dirty = 0;
 }
 
@@ -493,6 +495,17 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 	return &(to_vmx(vcpu)->pi_desc);
 }
 
+static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
+		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+		vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	}
+	return vmx->exit_qualification;
+}
+
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-- 
cgit v1.2.3


From 8791585837f659943936b8e1cce9d039436ad1ca Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 13:34:54 -0700
Subject: KVM: VMX: Cache vmcs.EXIT_INTR_INFO using arch avail_reg flags

Introduce a new "extended register" type, EXIT_INFO_2 (to pair with the
nomenclature in .get_exit_info()), and use it to cache VMX's
vmcs.EXIT_INTR_INFO.  Drop a comment in vmx_recover_nmi_blocking() that
is obsoleted by the generic caching mechanism.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415203454.8296-6-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/nested.c       | 10 +++++-----
 arch/x86/kvm/vmx/vmx.c          | 20 ++++++++------------
 arch/x86/kvm/vmx/vmx.h          | 14 +++++++++++++-
 4 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9673ec700cd0..ceba205d32a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,6 +171,7 @@ enum kvm_reg {
 	VCPU_EXREG_RFLAGS,
 	VCPU_EXREG_SEGMENTS,
 	VCPU_EXREG_EXIT_INFO_1,
+	VCPU_EXREG_EXIT_INFO_2,
 };
 
 enum {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dc16b2f8ce97..78744c6e4a04 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5419,7 +5419,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 
 fail:
 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
-			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmx_get_intr_info(vcpu),
 			  vmx_get_exit_qual(vcpu));
 	return 1;
 }
@@ -5652,7 +5652,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
-		intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+		intr_info = vmx_get_intr_info(vcpu);
 		if (is_nmi(intr_info))
 			return true;
 		else if (is_page_fault(intr_info))
@@ -5708,12 +5708,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
  */
 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
+	u32 intr_info = vmx_get_intr_info(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 intr_info;
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
-		intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+		intr_info = vmx_get_intr_info(vcpu);
 		if (is_nmi(intr_info))
 			return true;
 		else if (is_page_fault(intr_info))
@@ -5848,7 +5848,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 		goto reflect_vmexit;
 	}
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	exit_intr_info = vmx_get_intr_info(vcpu);
 	exit_qual = vmx_get_exit_qual(vcpu);
 
 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b7aab2284a9c..50951cdbe76f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5635,7 +5635,7 @@ static const int kvm_vmx_max_exit_handlers =
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
 	*info1 = vmx_get_exit_qual(vcpu);
-	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+	*info2 = vmx_get_intr_info(vcpu);
 }
 
 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -6298,16 +6298,16 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 
 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 {
-	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
 
 	/* if exit due to PF check for async PF */
-	if (is_page_fault(vmx->exit_intr_info)) {
+	if (is_page_fault(intr_info)) {
 		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 	/* Handle machine checks before interrupts are enabled */
-	} else if (is_machine_check(vmx->exit_intr_info)) {
+	} else if (is_machine_check(intr_info)) {
 		kvm_machine_check();
 	/* We need to handle NMIs before interrupts are enabled */
-	} else if (is_nmi(vmx->exit_intr_info)) {
+	} else if (is_nmi(intr_info)) {
 		kvm_before_interrupt(&vmx->vcpu);
 		asm("int $2");
 		kvm_after_interrupt(&vmx->vcpu);
@@ -6322,9 +6322,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 	unsigned long tmp;
 #endif
 	gate_desc *desc;
-	u32 intr_info;
+	u32 intr_info = vmx_get_intr_info(vcpu);
 
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (WARN_ONCE(!is_external_intr(intr_info),
 	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
 		return;
@@ -6405,11 +6404,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 	if (enable_vnmi) {
 		if (vmx->loaded_vmcs->nmi_known_unmasked)
 			return;
-		/*
-		 * Can't use vmx->exit_intr_info since we're not sure what
-		 * the exit reason is.
-		 */
-		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 		/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a13eafec67fc..edfb739e5907 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -451,7 +451,8 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 				  | (1 << VCPU_EXREG_PDPTR)
 				  | (1 << VCPU_EXREG_SEGMENTS)
 				  | (1 << VCPU_EXREG_CR3)
-				  | (1 << VCPU_EXREG_EXIT_INFO_1));
+				  | (1 << VCPU_EXREG_EXIT_INFO_1)
+				  | (1 << VCPU_EXREG_EXIT_INFO_2));
 	vcpu->arch.regs_dirty = 0;
 }
 
@@ -506,6 +507,17 @@ static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
 	return vmx->exit_qualification;
 }
 
+static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
+		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+		vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	}
+	return vmx->exit_intr_info;
+}
+
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
 void free_vmcs(struct vmcs *vmcs);
 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-- 
cgit v1.2.3


From c583eed6d72bfea853ca7aa645d9295dd2720c6a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 14:44:13 -0700
Subject: KVM: x86/mmu: Set @writable to false for non-visible accesses by L2

Explicitly set @writable to false in try_async_pf() if the GFN->PFN
translation is short-circuited due to the requested GFN not being
visible to L2.

Leaving @writable ('map_writable' in the callers) uninitialized is ok
in that it's never actually consumed, but one has to track it all the
way through set_spte() being short-circuited by set_mmio_spte() to
understand that the uninitialized variable is benign, and relying on
@writable being ignored is an unnecessary risk.  Explicitly setting
@writable also aligns try_async_pf() with __gfn_to_pfn_memslot().

Jim Mattson <jmattson@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200415214414.10194-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3385d6dde541..b67046fa0ac4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4090,6 +4090,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	 */
 	if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
 		*pfn = KVM_PFN_NOSLOT;
+		*writable = false;
 		return false;
 	}
 
-- 
cgit v1.2.3


From c36b71503a2268206ebeda6697094ffb4e7e94c2 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 16 Apr 2020 09:48:07 -0400
Subject: KVM: x86/mmu: Avoid an extra memslot lookup in try_async_pf() for L2

Create a new function kvm_is_visible_memslot() and use it from
kvm_is_visible_gfn(); use the new function in try_async_pf() too,
to avoid an extra memslot lookup.

Opportunistically squish a multi-line comment into a single-line comment.

Note, the end result, KVM_PFN_NOSLOT, is unchanged.

Cc: Jim Mattson <jmattson@google.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c   | 9 +++------
 include/linux/kvm_host.h | 6 ++++++
 virt/kvm/kvm_main.c      | 6 +-----
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b67046fa0ac4..e618472c572b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4082,19 +4082,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
 			 bool *writable)
 {
-	struct kvm_memory_slot *slot;
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	bool async;
 
-	/*
-	 * Don't expose private memslots to L2.
-	 */
-	if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+	/* Don't expose private memslots to L2. */
+	if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
 		*pfn = KVM_PFN_NOSLOT;
 		*writable = false;
 		return false;
 	}
 
-	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	async = false;
 	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
 	if (!async)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 658215f6102c..7d4f1eb70274 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1357,6 +1357,12 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 }
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 
+static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot)
+{
+	return (memslot && memslot->id < KVM_USER_MEM_SLOTS &&
+		!(memslot->flags & KVM_MEMSLOT_INVALID));
+}
+
 struct kvm_vcpu *kvm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index da8fd45e0e3e..8aa577db131e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1607,11 +1607,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 
-	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
-	      memslot->flags & KVM_MEMSLOT_INVALID)
-		return false;
-
-	return true;
+	return kvm_is_visible_memslot(memslot);
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-- 
cgit v1.2.3


From b8d295f96b6c9745d7eebee622690e0e835c63af Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 15 Apr 2020 17:07:39 -0700
Subject: KVM: nVMX: Remove non-functional "support" for CR3 target values

Remove all references to cr3_target_value[0-3] and replace the fields
in vmcs12 with "dead_space" to preserve the vmcs12 layout.  KVM doesn't
support emulating CR3-target values, despite a variety of code that
implies otherwise, as KVM unconditionally reports '0' for the number of
supported CR3-target values.

This technically fixes a bug where KVM would incorrectly allow VMREAD
and VMWRITE to nonexistent fields, i.e. cr3_target_value[0-3].  Per
Intel's SDM, the number of supported CR3-target values reported in
VMX_MISC also enumerates the existence of the associated VMCS fields:

  If a future implementation supports more than 4 CR3-target values, they
  will be encoded consecutively following the 4 encodings given here.

Alternatively, the "bug" could be fixed by actually advertisting support
for 4 CR3-target values, but that'd likely just enable kvm-unit-tests
given that no one has complained about lack of support for going on ten
years, e.g. KVM, Xen and HyperV don't use CR3-target values.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200416000739.9012-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/nested-vmx.rst |  5 +----
 arch/x86/kvm/vmx/evmcs.c              |  8 --------
 arch/x86/kvm/vmx/nested.c             | 17 -----------------
 arch/x86/kvm/vmx/vmcs12.c             |  4 ----
 arch/x86/kvm/vmx/vmcs12.h             | 10 ++--------
 arch/x86/kvm/vmx/vmx.c                |  9 ---------
 6 files changed, 3 insertions(+), 50 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/nested-vmx.rst b/Documentation/virt/kvm/nested-vmx.rst
index 592b0ab6970b..89851cbb7df9 100644
--- a/Documentation/virt/kvm/nested-vmx.rst
+++ b/Documentation/virt/kvm/nested-vmx.rst
@@ -116,10 +116,7 @@ struct shadow_vmcs is ever changed.
 		natural_width cr4_guest_host_mask;
 		natural_width cr0_read_shadow;
 		natural_width cr4_read_shadow;
-		natural_width cr3_target_value0;
-		natural_width cr3_target_value1;
-		natural_width cr3_target_value2;
-		natural_width cr3_target_value3;
+		natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
 		natural_width exit_qualification;
 		natural_width guest_linear_address;
 		natural_width guest_cr0;
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 303813423c3e..73f3e07c1852 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -160,14 +160,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
 		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
 	EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
 		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-	EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
-		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-	EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
-		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-	EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
-		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
-	EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
-		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
 
 	/* 32 bit rw */
 	EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 78744c6e4a04..f228339cd0a0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1785,10 +1785,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
 	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
 	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
 	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
-	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
-	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
-	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
-	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
 	 * vmcs12->page_fault_error_code_mask =
 	 *		evmcs->page_fault_error_code_mask;
 	 * vmcs12->page_fault_error_code_match =
@@ -1862,10 +1858,6 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
 	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
 	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
 	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
-	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
-	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
-	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
-	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
 	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
 	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
 	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
@@ -5541,15 +5533,6 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 				return true;
 			break;
 		case 3:
-			if ((vmcs12->cr3_target_count >= 1 &&
-					vmcs12->cr3_target_value0 == val) ||
-				(vmcs12->cr3_target_count >= 2 &&
-					vmcs12->cr3_target_value1 == val) ||
-				(vmcs12->cr3_target_count >= 3 &&
-					vmcs12->cr3_target_value2 == val) ||
-				(vmcs12->cr3_target_count >= 4 &&
-					vmcs12->cr3_target_value3 == val))
-				return false;
 			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
 				return true;
 			break;
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 53dfb401316d..c8e51c004f78 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -115,10 +115,6 @@ const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
 	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
 	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
-	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
-	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
-	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
-	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
 	FIELD(EXIT_QUALIFICATION, exit_qualification),
 	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
 	FIELD(GUEST_CR0, guest_cr0),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index d0c6df373f67..80232daf00ff 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -80,10 +80,7 @@ struct __packed vmcs12 {
 	natural_width cr4_guest_host_mask;
 	natural_width cr0_read_shadow;
 	natural_width cr4_read_shadow;
-	natural_width cr3_target_value0;
-	natural_width cr3_target_value1;
-	natural_width cr3_target_value2;
-	natural_width cr3_target_value3;
+	natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
 	natural_width exit_qualification;
 	natural_width guest_linear_address;
 	natural_width guest_cr0;
@@ -263,10 +260,7 @@ static inline void vmx_check_vmcs12_offsets(void)
 	CHECK_OFFSET(cr4_guest_host_mask, 352);
 	CHECK_OFFSET(cr0_read_shadow, 360);
 	CHECK_OFFSET(cr4_read_shadow, 368);
-	CHECK_OFFSET(cr3_target_value0, 376);
-	CHECK_OFFSET(cr3_target_value1, 384);
-	CHECK_OFFSET(cr3_target_value2, 392);
-	CHECK_OFFSET(cr3_target_value3, 400);
+	CHECK_OFFSET(dead_space, 376);
 	CHECK_OFFSET(exit_qualification, 408);
 	CHECK_OFFSET(guest_linear_address, 416);
 	CHECK_OFFSET(guest_cr0, 424);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 50951cdbe76f..281436b08e13 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5717,7 +5717,6 @@ void dump_vmcs(void)
 	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
 	unsigned long cr4;
 	u64 efer;
-	int i, n;
 
 	if (!dump_invalid_vmcs) {
 		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5854,14 +5853,6 @@ void dump_vmcs(void)
 		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
 	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
 		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
-	n = vmcs_read32(CR3_TARGET_COUNT);
-	for (i = 0; i + 1 < n; i += 4)
-		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
-		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
-		       i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
-	if (i < n)
-		pr_err("CR3 target%u=%016lx\n",
-		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
 	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
 		pr_err("PLE Gap=%08x Window=%08x\n",
 		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
-- 
cgit v1.2.3


From 873e1da16918a83cc7440e299dda0dc7ab6db34c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 10 Apr 2020 10:47:02 -0700
Subject: KVM: VMX: Optimize handling of VM-Entry failures in vmx_vcpu_run()

Mark the VM-Fail, VM-Exit on VM-Enter, and #MC on VM-Enter paths as
'unlikely' so as to improve code generation so that it favors successful
VM-Enter.  The performance of successful VM-Enter is for more important,
irrespective of whether or not success is actually likely.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200410174703.1138-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 281436b08e13..1fc10b821b78 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6724,11 +6724,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->nested.nested_run_pending = 0;
 	vmx->idt_vectoring_info = 0;
 
-	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
-	if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+	if (unlikely(vmx->fail)) {
+		vmx->exit_reason = 0xdead;
+		return;
+	}
+
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
 		kvm_machine_check();
 
-	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+	if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 		return;
 
 	vmx->loaded_vmcs->launched = 1;
-- 
cgit v1.2.3


From a9ab13ff6e844ad5b3ed39339e6db9a76bb539ad Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Fri, 10 Apr 2020 10:47:03 -0700
Subject: KVM: X86: Improve latency for single target IPI fastpath

IPI and Timer cause the main MSRs write vmexits in cloud environment
observation, let's optimize virtual IPI latency more aggressively to
inject target IPI as soon as possible.

Running kvm-unit-tests/vmexit.flat IPI testing on SKX server, disable
adaptive advance lapic timer and adaptive halt-polling to avoid the
interference, this patch can give another 7% improvement.

w/o fastpath   -> x86.c fastpath      4238 -> 3543  16.4%
x86.c fastpath -> vmx.c fastpath      3543 -> 3293     7%
w/o fastpath   -> vmx.c fastpath      4238 -> 3293  22.3%

Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200410174703.1138-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++---
 arch/x86/kvm/svm/svm.c          | 24 ++++++++++++++++--------
 arch/x86/kvm/vmx/vmx.c          | 22 +++++++++++++---------
 arch/x86/kvm/x86.c              |  6 +++---
 4 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ceba205d32a2..f26df2cb0591 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1126,7 +1126,7 @@ struct kvm_x86_ops {
 	 */
 	void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
-	void (*run)(struct kvm_vcpu *vcpu);
+	enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
 	int (*handle_exit)(struct kvm_vcpu *vcpu,
 		enum exit_fastpath_completion exit_fastpath);
 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1176,8 +1176,7 @@ struct kvm_x86_ops {
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage,
 			       struct x86_exception *exception);
-	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
-		enum exit_fastpath_completion *exit_fastpath);
+	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 
 	int (*check_nested_events)(struct kvm_vcpu *vcpu);
 	void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 747e60c65c98..a6f4e1bdb045 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3299,10 +3299,21 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 	svm_complete_interrupts(svm);
 }
 
+static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+	if (!is_guest_mode(vcpu) &&
+	    to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+	    to_svm(vcpu)->vmcb->control.exit_info_1)
+		return handle_fastpath_set_msr_irqoff(vcpu);
+
+	return EXIT_FASTPATH_NONE;
+}
+
 void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
 
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
+	enum exit_fastpath_completion exit_fastpath;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
@@ -3314,7 +3325,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * again.
 	 */
 	if (unlikely(svm->nested.exit_required))
-		return;
+		return EXIT_FASTPATH_NONE;
 
 	/*
 	 * Disable singlestep if we're injecting an interrupt/exception.
@@ -3398,6 +3409,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	stgi();
 
 	/* Any pending NMI will happen here */
+	exit_fastpath = svm_exit_handlers_fastpath(vcpu);
 
 	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
 		kvm_after_interrupt(&svm->vcpu);
@@ -3426,6 +3438,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		svm_handle_mce(svm);
 
 	mark_all_clean(svm->vmcb);
+	return exit_fastpath;
 }
 
 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3727,13 +3740,8 @@ out:
 	return ret;
 }
 
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion *exit_fastpath)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
-	if (!is_guest_mode(vcpu) &&
-	    to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
-	    to_svm(vcpu)->vmcb->control.exit_info_1)
-		*exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
 }
 
 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1fc10b821b78..766303b31949 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6350,8 +6350,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
 
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion *exit_fastpath)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6359,9 +6358,6 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
 		handle_external_interrupt_irqoff(vcpu);
 	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
 		handle_exception_nmi_irqoff(vmx);
-	else if (!is_guest_mode(vcpu) &&
-		vmx->exit_reason == EXIT_REASON_MSR_WRITE)
-		*exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
 }
 
 static bool vmx_has_emulated_msr(int index)
@@ -6565,8 +6561,9 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
+	enum exit_fastpath_completion exit_fastpath;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
 
@@ -6578,7 +6575,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	/* Don't enter VMX if guest state is invalid, let the exit handler
 	   start emulation until we arrive back to a valid state */
 	if (vmx->emulation_required)
-		return;
+		return EXIT_FASTPATH_NONE;
 
 	if (vmx->ple_window_dirty) {
 		vmx->ple_window_dirty = false;
@@ -6726,7 +6723,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	if (unlikely(vmx->fail)) {
 		vmx->exit_reason = 0xdead;
-		return;
+		return EXIT_FASTPATH_NONE;
 	}
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6734,13 +6731,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		kvm_machine_check();
 
 	if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
-		return;
+		return EXIT_FASTPATH_NONE;
+
+	if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+		exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
+	else
+		exit_fastpath = EXIT_FASTPATH_NONE;
 
 	vmx->loaded_vmcs->launched = 1;
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
+
+	return exit_fastpath;
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8104dc9ff5b2..8d9bcd5faac5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8179,7 +8179,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_int_win =
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
-	enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE;
+	enum exit_fastpath_completion exit_fastpath;
 
 	bool req_immediate_exit = false;
 
@@ -8406,7 +8406,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 	}
 
-	kvm_x86_ops.run(vcpu);
+	exit_fastpath = kvm_x86_ops.run(vcpu);
 
 	/*
 	 * Do this here before restoring debug registers on the host.  And
@@ -8438,7 +8438,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
-	kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath);
+	kvm_x86_ops.handle_exit_irqoff(vcpu);
 
 	/*
 	 * Consume any pending interrupts, including the possible source of
-- 
cgit v1.2.3


From 4f233371f6bb1578340b6c7034ddf7b76606d4ae Mon Sep 17 00:00:00 2001
From: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Date: Thu, 9 Apr 2020 16:50:33 -0400
Subject: KVM: nSVM: Check for CR0.CD and CR0.NW on VMRUN of nested guests

According to section "Canonicalization and Consistency Checks" in APM vol. 2,
the following guest state combination is illegal:

	"CR0.CD is zero and CR0.NW is set"

Signed-off-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Message-Id: <20200409205035.16830-2-krish.sadhukhan@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c62893102777..3e5bd739a6f6 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -207,6 +207,10 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	if ((vmcb->save.efer & EFER_SVME) == 0)
 		return false;
 
+	if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
+	    (vmcb->save.cr0 & X86_CR0_NW))
+		return false;
+
 	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
 		return false;
 
-- 
cgit v1.2.3


From 1b94f6f81007b4afaea3480ec018bc9236148961 Mon Sep 17 00:00:00 2001
From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Date: Thu, 16 Apr 2020 13:10:57 +0800
Subject: KVM: Remove redundant argument to kvm_arch_vcpu_ioctl_run

In earlier versions of kvm, 'kvm_run' was an independent structure
and was not included in the vcpu structure. At present, 'kvm_run'
is already included in the vcpu structure, so the parameter
'kvm_run' is redundant.

This patch simplifies the function definition, removes the extra
'kvm_run' parameter, and extracts it from the 'kvm_vcpu' structure
if necessary.

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Message-Id: <20200416051057.26526-1-tianjia.zhang@linux.alibaba.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c       |  3 ++-
 arch/powerpc/kvm/powerpc.c |  3 ++-
 arch/s390/kvm/kvm-s390.c   |  3 ++-
 arch/x86/kvm/x86.c         | 11 ++++++-----
 include/linux/kvm_host.h   |  2 +-
 virt/kvm/arm/arm.c         |  6 +++---
 virt/kvm/kvm_main.c        |  2 +-
 7 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index fdf1c14d9205..9f50ceef9978 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -438,8 +438,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	return -ENOIOCTLCMD;
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+	struct kvm_run *run = vcpu->run;
 	int r = -EINTR;
 
 	vcpu_load(vcpu);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e15166b0a16d..7e24691e138a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1764,8 +1764,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 	return r;
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+	struct kvm_run *run = vcpu->run;
 	int r;
 
 	vcpu_load(vcpu);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5307929a6c89..75471b646fd7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4333,8 +4333,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		store_regs_fmt2(vcpu, kvm_run);
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+	struct kvm_run *kvm_run = vcpu->run;
 	int rc;
 
 	if (kvm_run->immediate_exit)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d9bcd5faac5..59958ce2b681 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8710,8 +8710,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	trace_kvm_fpu(0);
 }
 
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+	struct kvm_run *kvm_run = vcpu->run;
 	int r;
 
 	vcpu_load(vcpu);
@@ -8729,18 +8730,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		r = -EAGAIN;
 		if (signal_pending(current)) {
 			r = -EINTR;
-			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.signal_exits;
 		}
 		goto out;
 	}
 
-	if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+	if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
 		r = -EINVAL;
 		goto out;
 	}
 
-	if (vcpu->run->kvm_dirty_regs) {
+	if (kvm_run->kvm_dirty_regs) {
 		r = sync_regs(vcpu);
 		if (r != 0)
 			goto out;
@@ -8770,7 +8771,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 out:
 	kvm_put_guest_fpu(vcpu);
-	if (vcpu->run->kvm_valid_regs)
+	if (kvm_run->kvm_valid_regs)
 		store_regs(vcpu);
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d4f1eb70274..5285a5568208 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -866,7 +866,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state);
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 					struct kvm_guest_debug *dbg);
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
 
 int kvm_arch_init(void *opaque);
 void kvm_arch_exit(void);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 48d0ec44ad77..f5390ac2165b 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -639,7 +639,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 /**
  * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  * @vcpu:	The VCPU pointer
- * @run:	The kvm_run structure pointer used for userspace state exchange
  *
  * This function is called through the VCPU_RUN ioctl called from user space. It
  * will execute VM code in a loop until the time slice for the process is used
@@ -647,8 +646,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
  * return with return value 0 and with the kvm_run structure filled in with the
  * required data for the requested emulation.
  */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
+	struct kvm_run *run = vcpu->run;
 	int ret;
 
 	if (unlikely(!kvm_vcpu_initialized(vcpu)))
@@ -659,7 +659,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		return ret;
 
 	if (run->exit_reason == KVM_EXIT_MMIO) {
-		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+		ret = kvm_handle_mmio_return(vcpu, run);
 		if (ret)
 			return ret;
 	}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8aa577db131e..e2f60e313c87 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3129,7 +3129,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				synchronize_rcu();
 			put_pid(oldpid);
 		}
-		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		r = kvm_arch_vcpu_ioctl_run(vcpu);
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}
-- 
cgit v1.2.3


From e72436bc3a5206f95bb384e741154166ddb3202e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 17 Apr 2020 12:21:06 -0400
Subject: KVM: SVM: avoid infinite loop on NPF from bad address

When a nested page fault is taken from an address that does not have
a memslot associated to it, kvm_mmu_do_page_fault returns RET_PF_EMULATE
(via mmu_set_spte) and kvm_mmu_page_fault then invokes svm_need_emulation_on_page_fault.

The default answer there is to return false, but in this case this just
causes the page fault to be retried ad libitum.  Since this is not a
fast path, and the only other case where it is taken is an erratum,
just stick a kvm_vcpu_gfn_to_memslot check in there to detect the
common case where the erratum is not happening.

This fixes an infinite loop in the new set_memory_region_test.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 7 +++++++
 virt/kvm/kvm_main.c    | 1 +
 2 files changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a6f4e1bdb045..eb95283ab68d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3837,6 +3837,13 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 	bool smap = cr4 & X86_CR4_SMAP;
 	bool is_user = svm_get_cpl(vcpu) == 3;
 
+	/*
+	 * If RIP is invalid, go ahead with emulation which will cause an
+	 * internal error exit.
+	 */
+	if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
+		return true;
+
 	/*
 	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
 	 *
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e2f60e313c87..e7436d054305 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1602,6 +1602,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 {
 	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
-- 
cgit v1.2.3


From 56083bdf67c78030f11cdaed5b2b54959a329b02 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 17 Apr 2020 10:32:53 -0400
Subject: KVM: x86: check_nested_events is never NULL

Both Intel and AMD now implement it, so there is no need to check if the
callback is implemented.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 59958ce2b681..0492baeb78ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7699,7 +7699,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	 * from L2 to L1 due to pending L1 events which require exit
 	 * from L2 to L1.
 	 */
-	if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
+	if (is_guest_mode(vcpu)) {
 		r = kvm_x86_ops.check_nested_events(vcpu);
 		if (r != 0)
 			return r;
@@ -7761,7 +7761,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 		 * proposal and current concerns.  Perhaps we should be setting
 		 * KVM_REQ_EVENT only on certain events and not unconditionally?
 		 */
-		if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
+		if (is_guest_mode(vcpu)) {
 			r = kvm_x86_ops.check_nested_events(vcpu);
 			if (r != 0)
 				return r;
@@ -8527,7 +8527,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events)
+	if (is_guest_mode(vcpu))
 		kvm_x86_ops.check_nested_events(vcpu);
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
-- 
cgit v1.2.3


From 25091990ef289a5962b5bddd8c68de3f691c9e3c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 17 Apr 2020 10:34:29 -0400
Subject: KVM: eVMCS: check if nesting is enabled

In the next patch nested_get_evmcs_version will be always set in kvm_x86_ops for
VMX, even if nesting is disabled.  Therefore, check whether VMX (aka nesting)
is available in the function, the caller will not do the check anymore.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/evmcs.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 73f3e07c1852..e5325bd0f304 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -4,6 +4,7 @@
 #include <linux/smp.h>
 
 #include "../hyperv.h"
+#include "../cpuid.h"
 #include "evmcs.h"
 #include "vmcs.h"
 #include "vmx.h"
@@ -326,17 +327,18 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
 
 uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       /*
-        * vmcs_version represents the range of supported Enlightened VMCS
-        * versions: lower 8 bits is the minimal version, higher 8 bits is the
-        * maximum supported version. KVM supports versions from 1 to
-        * KVM_EVMCS_VERSION.
-        */
-       if (vmx->nested.enlightened_vmcs_enabled)
-               return (KVM_EVMCS_VERSION << 8) | 1;
-
-       return 0;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	/*
+	 * vmcs_version represents the range of supported Enlightened VMCS
+	 * versions: lower 8 bits is the minimal version, higher 8 bits is the
+	 * maximum supported version. KVM supports versions from 1 to
+	 * KVM_EVMCS_VERSION.
+	 */
+	if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+	    vmx->nested.enlightened_vmcs_enabled)
+		return (KVM_EVMCS_VERSION << 8) | 1;
+
+	return 0;
 }
 
 void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
-- 
cgit v1.2.3


From 33b22172452f05c351fd2fa24c28d2e76c7b0692 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 17 Apr 2020 10:24:18 -0400
Subject: KVM: x86: move nested-related kvm_x86_ops to a separate struct

Clean up some of the patching of kvm_x86_ops, by moving kvm_x86_ops related to
nested virtualization into a separate struct.

As a result, these ops will always be non-NULL on VMX.  This is not a problem:

* check_nested_events is only called if is_guest_mode(vcpu) returns true

* get_nested_state treats VMXOFF state the same as nested being disabled

* set_nested_state fails if you attempt to set nested state while
  nesting is disabled

* nested_enable_evmcs could already be called on a CPU without VMX enabled
  in CPUID.

* nested_get_evmcs_version was fixed in the previous patch

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 29 ++++++++++++++++-------------
 arch/x86/kvm/hyperv.c           |  4 ++--
 arch/x86/kvm/svm/nested.c       |  6 +++++-
 arch/x86/kvm/svm/svm.c          | 13 +++++--------
 arch/x86/kvm/svm/svm.h          |  3 ++-
 arch/x86/kvm/vmx/nested.c       | 16 +++++++++-------
 arch/x86/kvm/vmx/nested.h       |  2 ++
 arch/x86/kvm/vmx/vmx.c          |  7 +------
 arch/x86/kvm/x86.c              | 28 ++++++++++++++--------------
 9 files changed, 56 insertions(+), 52 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f26df2cb0591..a239a297be33 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1178,7 +1178,6 @@ struct kvm_x86_ops {
 			       struct x86_exception *exception);
 	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 
-	int (*check_nested_events)(struct kvm_vcpu *vcpu);
 	void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
 	void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
@@ -1211,6 +1210,7 @@ struct kvm_x86_ops {
 
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
+	const struct kvm_x86_nested_ops *nested_ops;
 
 	/*
 	 * Architecture specific hooks for vCPU blocking due to
@@ -1238,14 +1238,6 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
-	int (*get_nested_state)(struct kvm_vcpu *vcpu,
-				struct kvm_nested_state __user *user_kvm_nested_state,
-				unsigned user_data_size);
-	int (*set_nested_state)(struct kvm_vcpu *vcpu,
-				struct kvm_nested_state __user *user_kvm_nested_state,
-				struct kvm_nested_state *kvm_state);
-	bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
-
 	int (*smi_allowed)(struct kvm_vcpu *vcpu);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
@@ -1257,16 +1249,27 @@ struct kvm_x86_ops {
 
 	int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
-	int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
-				   uint16_t *vmcs_version);
-	uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
-
 	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
 
 	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
 	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
 };
 
+struct kvm_x86_nested_ops {
+	int (*check_events)(struct kvm_vcpu *vcpu);
+	int (*get_state)(struct kvm_vcpu *vcpu,
+			 struct kvm_nested_state __user *user_kvm_nested_state,
+			 unsigned user_data_size);
+	int (*set_state)(struct kvm_vcpu *vcpu,
+			 struct kvm_nested_state __user *user_kvm_nested_state,
+			 struct kvm_nested_state *kvm_state);
+	bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
+	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
+			    uint16_t *vmcs_version);
+	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+};
+
 struct kvm_x86_init_ops {
 	int (*cpu_has_kvm_support)(void);
 	int (*disabled_by_bios)(void);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b850f676abe4..2f96ff9e60ee 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1799,8 +1799,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 	};
 	int i, nent = ARRAY_SIZE(cpuid_entries);
 
-	if (kvm_x86_ops.nested_get_evmcs_version)
-		evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu);
+	if (kvm_x86_ops.nested_ops->get_evmcs_version)
+		evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
 
 	/* Skip NESTED_FEATURES if eVMCS is not supported */
 	if (!evmcs_ver)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3e5bd739a6f6..6ea047e6882e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -784,7 +784,7 @@ static bool nested_exit_on_intr(struct vcpu_svm *svm)
 	return (svm->nested.intercept & 1ULL);
 }
 
-int svm_check_nested_events(struct kvm_vcpu *vcpu)
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	bool block_nested_events =
@@ -825,3 +825,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 
 	return NESTED_EXIT_CONTINUE;
 }
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+	.check_events = svm_check_nested_events,
+};
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index eb95283ab68d..c86f7278509b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3902,9 +3902,9 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 	/*
 	 * TODO: Last condition latch INIT signals on vCPU when
 	 * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
-	 * To properly emulate the INIT intercept, SVM should implement
-	 * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit()
-	 * there if an INIT signal is pending.
+	 * To properly emulate the INIT intercept,
+	 * svm_check_nested_events() should call nested_svm_vmexit()
+	 * if an INIT signal is pending.
 	 */
 	return !gif_set(svm) ||
 		   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
@@ -4032,6 +4032,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.sched_in = svm_sched_in,
 
 	.pmu_ops = &amd_pmu_ops,
+	.nested_ops = &svm_nested_ops,
+
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
 	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
 	.update_pi_irte = svm_update_pi_irte,
@@ -4046,14 +4048,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
 
-	.nested_enable_evmcs = NULL,
-	.nested_get_evmcs_version = NULL,
-
 	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
-
-	.check_nested_events = svm_check_nested_events,
 };
 
 static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index ca95204f9dde..98c2890d561d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -398,9 +398,10 @@ int nested_svm_exit_handled(struct vcpu_svm *svm);
 int nested_svm_check_permissions(struct vcpu_svm *svm);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 			       bool has_error_code, u32 error_code);
-int svm_check_nested_events(struct kvm_vcpu *vcpu);
 int nested_svm_exit_special(struct vcpu_svm *svm);
 
+extern struct kvm_x86_nested_ops svm_nested_ops;
+
 /* avic.c */
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f228339cd0a0..56074d3443e0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6440,12 +6440,14 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
 	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
 	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;
 
-	ops->check_nested_events = vmx_check_nested_events;
-	ops->get_nested_state = vmx_get_nested_state;
-	ops->set_nested_state = vmx_set_nested_state;
-	ops->get_vmcs12_pages = nested_get_vmcs12_pages;
-	ops->nested_enable_evmcs = nested_enable_evmcs;
-	ops->nested_get_evmcs_version = nested_get_evmcs_version;
-
 	return 0;
 }
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+	.check_events = vmx_check_nested_events,
+	.get_state = vmx_get_nested_state,
+	.set_state = vmx_set_nested_state,
+	.get_vmcs12_pages = nested_get_vmcs12_pages,
+	.enable_evmcs = nested_enable_evmcs,
+	.get_evmcs_version = nested_get_evmcs_version,
+};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 1514ff4db77f..7ce9572c3d3a 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -278,4 +278,6 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
 #define nested_guest_cr4_valid	nested_cr4_valid
 #define nested_host_cr4_valid	nested_cr4_valid
 
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
 #endif /* __KVM_X86_VMX_NESTED_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 766303b31949..455cd2c8dbce 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7862,6 +7862,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.post_block = vmx_post_block,
 
 	.pmu_ops = &intel_pmu_ops,
+	.nested_ops = &vmx_nested_ops,
 
 	.update_pi_irte = vmx_update_pi_irte,
 
@@ -7877,12 +7878,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.pre_leave_smm = vmx_pre_leave_smm,
 	.enable_smi_window = enable_smi_window,
 
-	.check_nested_events = NULL,
-	.get_nested_state = NULL,
-	.set_nested_state = NULL,
-	.get_vmcs12_pages = NULL,
-	.nested_enable_evmcs = NULL,
-	.nested_get_evmcs_version = NULL,
 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0492baeb78ab..8c0b77ac8dc6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3442,14 +3442,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
 	case KVM_CAP_NESTED_STATE:
-		r = kvm_x86_ops.get_nested_state ?
-			kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0;
+		r = kvm_x86_ops.nested_ops->get_state ?
+			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
 		break;
 	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
 		r = kvm_x86_ops.enable_direct_tlbflush != NULL;
 		break;
 	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
-		r = kvm_x86_ops.nested_enable_evmcs != NULL;
+		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
 		break;
 	default:
 		break;
@@ -4235,9 +4235,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		return kvm_hv_activate_synic(vcpu, cap->cap ==
 					     KVM_CAP_HYPERV_SYNIC2);
 	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
-		if (!kvm_x86_ops.nested_enable_evmcs)
+		if (!kvm_x86_ops.nested_ops->enable_evmcs)
 			return -ENOTTY;
-		r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version);
+		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
 		if (!r) {
 			user_ptr = (void __user *)(uintptr_t)cap->args[0];
 			if (copy_to_user(user_ptr, &vmcs_version,
@@ -4552,7 +4552,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		u32 user_data_size;
 
 		r = -EINVAL;
-		if (!kvm_x86_ops.get_nested_state)
+		if (!kvm_x86_ops.nested_ops->get_state)
 			break;
 
 		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
@@ -4560,8 +4560,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (get_user(user_data_size, &user_kvm_nested_state->size))
 			break;
 
-		r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state,
-						  user_data_size);
+		r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+						     user_data_size);
 		if (r < 0)
 			break;
 
@@ -4582,7 +4582,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		int idx;
 
 		r = -EINVAL;
-		if (!kvm_x86_ops.set_nested_state)
+		if (!kvm_x86_ops.nested_ops->set_state)
 			break;
 
 		r = -EFAULT;
@@ -4604,7 +4604,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			break;
 
 		idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 		break;
 	}
@@ -7700,7 +7700,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	 * from L2 to L1.
 	 */
 	if (is_guest_mode(vcpu)) {
-		r = kvm_x86_ops.check_nested_events(vcpu);
+		r = kvm_x86_ops.nested_ops->check_events(vcpu);
 		if (r != 0)
 			return r;
 	}
@@ -7762,7 +7762,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 		 * KVM_REQ_EVENT only on certain events and not unconditionally?
 		 */
 		if (is_guest_mode(vcpu)) {
-			r = kvm_x86_ops.check_nested_events(vcpu);
+			r = kvm_x86_ops.nested_ops->check_events(vcpu);
 			if (r != 0)
 				return r;
 		}
@@ -8185,7 +8185,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-			if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) {
+			if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
 				r = 0;
 				goto out;
 			}
@@ -8528,7 +8528,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu))
-		kvm_x86_ops.check_nested_events(vcpu);
+		kvm_x86_ops.nested_ops->check_events(vcpu);
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
-- 
cgit v1.2.3


From 9bd4af240f4db39e754081d135e6ef7a54bb6828 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 21 Apr 2020 00:53:27 -0700
Subject: KVM: nVMX: Drop a redundant call to vmx_get_intr_info()

Drop nested_vmx_l1_wants_exit()'s initialization of intr_info from
vmx_get_intr_info() that was inadvertantly introduced along with the
caching mechanism.  EXIT_REASON_EXCEPTION_NMI, the only consumer of
intr_info, populates the variable before using it.

Fixes: bb53120d67cd ("KVM: VMX: Cache vmcs.EXIT_INTR_INFO using arch avail_reg flags")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200421075328.14458-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 56074d3443e0..12b78468317c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5691,8 +5691,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
  */
 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
-	u32 intr_info = vmx_get_intr_info(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u32 intr_info;
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
-- 
cgit v1.2.3


From 87796555d48ca1cc681515e065b824ecab9c4282 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 17:11:27 -0700
Subject: KVM: nVMX: Store vmcs.EXIT_QUALIFICATION as an unsigned long, not u32

Use an unsigned long for 'exit_qual' in nested_vmx_reflect_vmexit(), the
EXIT_QUALIFICATION field is naturally sized, not a 32-bit field.

The bug is most easily observed by doing VMXON (or any VMX instruction)
in L2 with a negative displacement, in which case dropping the upper
bits on nested VM-Exit results in L1 calculating the wrong virtual
address for the memory operand, e.g. "vmxon -0x8(%rbp)" yields:

  Unhandled cpu exception 14 #PF at ip 0000000000400553
  rbp=0000000000537000 cr2=0000000100536ff8

Fixes: fbdd50250396d ("KVM: nVMX: Move VM-Fail check out of nested_vmx_exit_reflected()")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423001127.13490-1-sean.j.christopherson@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 12b78468317c..b516c24494e3 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5814,7 +5814,8 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason = vmx->exit_reason;
-	u32 exit_intr_info, exit_qual;
+	unsigned long exit_qual;
+	u32 exit_intr_info;
 
 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-- 
cgit v1.2.3


From 7c67f54661fcc8d141fb11abbab1739f32e13b03 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 10:52:48 -0400
Subject: KVM: SVM: do not allow VMRUN inside SMM

VMRUN is not supported inside the SMM handler and the behavior is undefined.
Just raise a #UD.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6ea047e6882e..a7c3b3030e59 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -345,8 +345,12 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 	struct kvm_host_map map;
 	u64 vmcb_gpa;
 
-	vmcb_gpa = svm->vmcb->save.rax;
+	if (is_smm(&svm->vcpu)) {
+		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+		return 1;
+	}
 
+	vmcb_gpa = svm->vmcb->save.rax;
 	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
 	if (ret == -EINVAL) {
 		kvm_inject_gp(&svm->vcpu, 0);
-- 
cgit v1.2.3


From f74f94140fa50f768e61d626de4c146502b9102d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 13:22:27 -0400
Subject: KVM: SVM: introduce nested_run_pending

We want to inject vmexits immediately from svm_check_nested_events,
so that the interrupt/NMI window requests happen in inject_pending_event
right after it returns.

This however has the same issue as in vmx_check_nested_events, so
introduce a nested_run_pending flag with the exact same purpose
of delaying vmexit injection after the vmentry.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 4 +++-
 arch/x86/kvm/svm/svm.c    | 1 +
 arch/x86/kvm/svm/svm.h    | 4 ++++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1429f506fe9e..7a724ea3d994 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -414,6 +414,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 	copy_vmcb_control_area(hsave, vmcb);
 
+	svm->nested.nested_run_pending = 1;
 	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
 	if (!nested_svm_vmrun_msrpm(svm)) {
@@ -815,7 +816,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	bool block_nested_events =
-		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
+		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
+		svm->nested.nested_run_pending;
 
 	if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
 		if (block_nested_events)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b627564e41f9..c2a4e2d21676 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3429,6 +3429,7 @@ static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu)
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
+	svm->nested.nested_run_pending = 0;
 
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 98c2890d561d..435f3328c99c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -97,6 +97,10 @@ struct nested_state {
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
 
+	/* A VMRUN has started but has not yet been performed, so
+	 * we cannot inject a nested vmexit yet.  */
+	bool nested_run_pending;
+
 	/* cache for intercepts of the guest */
 	u32 intercept_cr;
 	u32 intercept_dr;
-- 
cgit v1.2.3


From 38c0b192bd6df089cdffd9828d19e764385f96e6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 13:13:09 -0400
Subject: KVM: SVM: leave halted state on vmexit

Similar to VMX, we need to leave the halted state when performing a vmexit.
Failure to do so will cause a hang after vmexit.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7a724ea3d994..2a0f1b043145 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -473,6 +473,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	leave_guest_mode(&svm->vcpu);
 	svm->nested.vmcb = 0;
 
+	/* in case we halted in L2 */
+	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
 	/* Give the current vmcb to the guest */
 	disable_gif(svm);
 
-- 
cgit v1.2.3


From 6e085cbfb0f0c7de4ca0f370adb572b0e07a818c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 13:15:33 -0400
Subject: KVM: SVM: immediately inject INTR vmexit

We can immediately leave SVM guest mode in svm_check_nested_events
now that we have the nested_run_pending mechanism.  This makes
things easier because we can run the rest of inject_pending_event
with GIF=0, and KVM will naturally end up requesting the next
interrupt window.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 2a0f1b043145..4654668798b7 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -801,13 +801,13 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 
 static void nested_svm_intr(struct vcpu_svm *svm)
 {
+	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+
 	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
 	svm->vmcb->control.exit_info_1 = 0;
 	svm->vmcb->control.exit_info_2 = 0;
 
-	/* nested_svm_vmexit this gets called afterwards from handle_exit */
-	svm->nested.exit_required = true;
-	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+	nested_svm_vmexit(svm);
 }
 
 static bool nested_exit_on_intr(struct vcpu_svm *svm)
-- 
cgit v1.2.3


From 9c3d370a8efaac49b0520617c9ab4e552aefbf1e Mon Sep 17 00:00:00 2001
From: Cathy Avery <cavery@redhat.com>
Date: Tue, 14 Apr 2020 16:11:06 -0400
Subject: KVM: SVM: Implement check_nested_events for NMI

Migrate nested guest NMI intercept processing
to new check_nested_events.

Signed-off-by: Cathy Avery <cavery@redhat.com>
Message-Id: <20200414201107.22952-2-cavery@redhat.com>
[Reorder clauses as NMIs have higher priority than IRQs; inject
 immediate vmexit as is now done for IRQ vmexits. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 21 +++++++++++++++++++++
 arch/x86/kvm/svm/svm.c    |  6 ++----
 arch/x86/kvm/svm/svm.h    | 15 ---------------
 3 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4654668798b7..3f268a3041a3 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -799,6 +799,20 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	return vmexit;
 }
 
+static bool nested_exit_on_nmi(struct vcpu_svm *svm)
+{
+	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
+}
+
+static void nested_svm_nmi(struct vcpu_svm *svm)
+{
+	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
+
+	nested_svm_vmexit(svm);
+}
+
 static void nested_svm_intr(struct vcpu_svm *svm)
 {
 	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
@@ -822,6 +836,13 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
 		svm->nested.nested_run_pending;
 
+	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(svm)) {
+		if (block_nested_events)
+			return -EBUSY;
+		nested_svm_nmi(svm);
+		return 0;
+	}
+
 	if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
 		if (block_nested_events)
 			return -EBUSY;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c2a4e2d21676..f97f29e1eb49 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3070,9 +3070,10 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 	int ret;
+
 	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
-	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+	ret = ret && gif_set(svm);
 
 	return ret;
 }
@@ -3150,9 +3151,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 		return; /* STGI will cause a vm exit */
 	}
 
-	if (svm->nested.exit_required)
-		return; /* we're not going to run the guest yet */
-
 	/*
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 435f3328c99c..a2bc33aadb67 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -373,21 +373,6 @@ void disable_nmi_singlestep(struct vcpu_svm *svm);
 #define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
 #define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */
 
-/* This function returns true if it is save to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
-	if (!is_guest_mode(&svm->vcpu))
-		return true;
-
-	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
-		return true;
-
-	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-	svm->nested.exit_required = true;
-
-	return false;
-}
-
 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 {
 	return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
-- 
cgit v1.2.3


From 6ce347af1499deff5890a52aedacd451330af706 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:38 -0700
Subject: KVM: nVMX: Preserve exception priority irrespective of exiting
 behavior

Short circuit vmx_check_nested_events() if an exception is pending and
needs to be injected into L2, priority between coincident events is not
dependent on exiting behavior.  This fixes a bug where a single-step #DB
that is not intercepted by L1 is incorrectly dropped due to servicing a
VMX Preemption Timer VM-Exit.

Injected exceptions also need to be blocked if nested VM-Enter is
pending or an exception was already injected, otherwise injecting the
exception could overwrite an existing event injection from L1.
Technically, this scenario should be impossible, i.e. KVM shouldn't
inject its own exception during nested VM-Enter.  This will be addressed
in a future patch.

Note, event priority between SMI, NMI and INTR is incorrect for L2, e.g.
SMI should take priority over VM-Exit on NMI/INTR, and NMI that is
injected into L2 should take priority over VM-Exit INTR.  This will also
be addressed in a future patch.

Fixes: b6b8a1451fc4 ("KVM: nVMX: Rework interception of IRQs and NMIs")
Reported-by: Jim Mattson <jmattson@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Peter Shier <pshier@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b644bbf85460..de678be3c92d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3716,11 +3716,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	/*
 	 * Process any exceptions that are not debug traps before MTF.
 	 */
-	if (vcpu->arch.exception.pending &&
-	    !vmx_pending_dbg_trap(vcpu) &&
-	    nested_vmx_check_exception(vcpu, &exit_qual)) {
+	if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_vmx_check_exception(vcpu, &exit_qual))
+			goto no_vmexit;
 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
 		return 0;
 	}
@@ -3733,10 +3733,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.exception.pending &&
-	    nested_vmx_check_exception(vcpu, &exit_qual)) {
+	if (vcpu->arch.exception.pending) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_vmx_check_exception(vcpu, &exit_qual))
+			goto no_vmexit;
 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
 		return 0;
 	}
@@ -3771,6 +3772,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+no_vmexit:
 	vmx_complete_nested_posted_interrupt(vcpu);
 	return 0;
 }
-- 
cgit v1.2.3


From d2060bd42e4482b15c35f961a294ee57f369027d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:39 -0700
Subject: KVM: nVMX: Open a window for pending nested VMX preemption timer

Add a kvm_x86_ops hook to detect a nested pending "hypervisor timer" and
use it to effectively open a window for servicing the expired timer.
Like pending SMIs on VMX, opening a window simply means requesting an
immediate exit.

This fixes a bug where an expired VMX preemption timer (for L2) will be
delayed and/or lost if a pending exception is injected into L2.  The
pending exception is rightly prioritized by vmx_check_nested_events()
and injected into L2, with the preemption timer left pending.  Because
no window opened, L2 is free to run uninterrupted.

Fixes: f4124500c2c13 ("KVM: nVMX: Fully emulate preemption timer")
Reported-by: Jim Mattson <jmattson@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Peter Shier <pshier@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-3-sean.j.christopherson@intel.com>
[Check it in kvm_vcpu_has_events too, to ensure that the preemption
 timer is serviced promptly even if the vCPU is halted and L1 is not
 intercepting HLT. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx/nested.c       | 10 ++++++++--
 arch/x86/kvm/x86.c              |  9 +++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b3a5da27c2a5..e6671c61fd65 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1256,6 +1256,7 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
 	int (*check_events)(struct kvm_vcpu *vcpu);
+	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
 			 struct kvm_nested_state __user *user_kvm_nested_state,
 			 unsigned user_data_size);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index de678be3c92d..51220a14a633 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3687,6 +3687,12 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
 			    vcpu->arch.exception.payload);
 }
 
+static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
+{
+	return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+	       to_vmx(vcpu)->nested.preemption_timer_expired;
+}
+
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3742,8 +3748,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
-	    vmx->nested.preemption_timer_expired) {
+	if (nested_vmx_preemption_timer_pending(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
@@ -6448,6 +6453,7 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
 	.check_events = vmx_check_nested_events,
+	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.get_state = vmx_get_nested_state,
 	.set_state = vmx_set_nested_state,
 	.get_vmcs12_pages = nested_get_vmcs12_pages,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 542a00008caa..e874182d113c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8355,6 +8355,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 				kvm_x86_ops.enable_nmi_window(vcpu);
 			if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
 				kvm_x86_ops.enable_irq_window(vcpu);
+			if (is_guest_mode(vcpu) &&
+			    kvm_x86_ops.nested_ops->hv_timer_pending &&
+			    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+				req_immediate_exit = true;
 			WARN_ON(vcpu->arch.exception.pending);
 		}
 
@@ -10211,6 +10215,11 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (kvm_hv_has_stimer_pending(vcpu))
 		return true;
 
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops.nested_ops->hv_timer_pending &&
+	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+		return true;
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 8081ad06b68a728e676d3b08e9ab70ce4039747b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:40 -0700
Subject: KVM: x86: Set KVM_REQ_EVENT if run is canceled with
 req_immediate_exit set

Re-request KVM_REQ_EVENT if vcpu_enter_guest() bails after processing
pending requests and an immediate exit was requested.  This fixes a bug
where a pending event, e.g. VMX preemption timer, is delayed and/or lost
if the exit was deferred due to something other than a higher priority
_injected_ event, e.g. due to a pending nested VM-Enter.  This bug only
affects the !injected case as kvm_x86_ops.cancel_injection() sets
KVM_REQ_EVENT to redo the injection, but that's purely serendipitous
behavior with respect to the deferred event.

Note, emulated preemption timer isn't the only event that can be
affected, it simply happens to be the only event where not re-requesting
KVM_REQ_EVENT is blatantly visible to the guest.

Fixes: f4124500c2c13 ("KVM: nVMX: Fully emulate preemption timer")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-4-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e874182d113c..bdcb4e7c579d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8519,6 +8519,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	return r;
 
 cancel_injection:
+	if (req_immediate_exit)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_x86_ops.cancel_injection(vcpu);
 	if (unlikely(vcpu->arch.apic_attention))
 		kvm_lapic_sync_from_vapic(vcpu);
-- 
cgit v1.2.3


From 88c604b66eb6a74841cd40f5bdf639112ad69115 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:41 -0700
Subject: KVM: x86: Make return for {interrupt_nmi,smi}_allowed() a bool
 instead of int

Return an actual bool for kvm_x86_ops' {interrupt_nmi}_allowed() hook to
better reflect the return semantics, and to avoid creating an even
bigger mess when the related VMX code is refactored in upcoming patches.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-5-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++---
 arch/x86/kvm/svm/svm.c          | 16 ++++++++--------
 arch/x86/kvm/vmx/vmx.c          | 14 +++++++-------
 3 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e6671c61fd65..44268642b3c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,8 +1138,8 @@ struct kvm_x86_ops {
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu);
 	void (*cancel_injection)(struct kvm_vcpu *vcpu);
-	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
-	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+	bool (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
 	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
@@ -1237,7 +1237,7 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
-	int (*smi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*smi_allowed)(struct kvm_vcpu *vcpu);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f97f29e1eb49..c6e5374db825 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3065,11 +3065,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+static bool svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	int ret;
+	bool ret;
 
 	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
@@ -3098,14 +3098,14 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 	}
 }
 
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 
 	if (!gif_set(svm) ||
 	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
-		return 0;
+		return false;
 
 	if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
 		return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
@@ -3767,23 +3767,23 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
 	vcpu->arch.mcg_cap &= 0x1ff;
 }
 
-static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	/* Per APM Vol.2 15.22.2 "Response to SMI" */
 	if (!gif_set(svm))
-		return 0;
+		return false;
 
 	if (is_guest_mode(&svm->vcpu) &&
 	    svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
 		/* TODO: Might need to set exit_info_1 and exit_info_2 here */
 		svm->vmcb->control.exit_code = SVM_EXIT_SMI;
 		svm->nested.exit_required = true;
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46aa3ca01929..4ca1f1f4ef1a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4510,21 +4510,21 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 	}
 }
 
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return 0;
+		return false;
 
 	if (!enable_vnmi &&
 	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
-		return 0;
+		return false;
 
 	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
 		   | GUEST_INTR_STATE_NMI));
 }
 
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
@@ -7675,12 +7675,12 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEAT_CTL_LMCE_ENABLED;
 }
 
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_smi_allowed(struct kvm_vcpu *vcpu)
 {
 	/* we need a nested vmexit to enter SMM, postpone if run is pending */
 	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
-- 
cgit v1.2.3


From a9fa7cb6aa997ba58294f1a07d402ce5855bafe1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 11:02:36 -0400
Subject: KVM: x86: replace is_smm checks with kvm_x86_ops.smi_allowed

Do not hardcode is_smm so that all the architectural conditions for
blocking SMIs are listed in a single place.  Well, in two places because
this introduces some code duplication between Intel and AMD.

This ensures that nested SVM obeys GIF in kvm_vcpu_has_events.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 2 +-
 arch/x86/kvm/vmx/vmx.c | 2 +-
 arch/x86/kvm/x86.c     | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c6e5374db825..739414028536 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3783,7 +3783,7 @@ static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
 		return false;
 	}
 
-	return true;
+	return !is_smm(vcpu);
 }
 
 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4ca1f1f4ef1a..d0bea514d447 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7680,7 +7680,7 @@ static bool vmx_smi_allowed(struct kvm_vcpu *vcpu)
 	/* we need a nested vmexit to enter SMM, postpone if run is pending */
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
-	return true;
+	return !is_smm(vcpu);
 }
 
 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bdcb4e7c579d..446fbdd05bd4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7764,8 +7764,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	if (kvm_event_needs_reinjection(vcpu))
 		return 0;
 
-	if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
-	    kvm_x86_ops.smi_allowed(vcpu)) {
+	if (vcpu->arch.smi_pending && kvm_x86_ops.smi_allowed(vcpu)) {
 		vcpu->arch.smi_pending = false;
 		++vcpu->arch.smi_count;
 		enter_smm(vcpu);
@@ -10206,7 +10205,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 		return true;
 
 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
-	    (vcpu->arch.smi_pending && !is_smm(vcpu)))
+	    (vcpu->arch.smi_pending &&
+	     kvm_x86_ops.smi_allowed(vcpu)))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
-- 
cgit v1.2.3


From 429ab576f387c8fcaeb2a172901aaba3c6794310 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:42 -0700
Subject: KVM: nVMX: Report NMIs as allowed when in L2 and Exit-on-NMI is set

Report NMIs as allowed when the vCPU is in L2 and L2 is being run with
Exit-on-NMI enabled, as NMIs are always unblocked from L1's perspective
in this case.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-7-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 5 -----
 arch/x86/kvm/vmx/nested.h | 5 +++++
 arch/x86/kvm/vmx/vmx.c    | 3 +++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 51220a14a633..643f86d7d537 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -698,11 +698,6 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
 		VM_EXIT_ACK_INTR_ON_EXIT;
 }
 
-static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
-{
-	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
-}
-
 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
 					  struct vmcs12 *vmcs12)
 {
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 7ce9572c3d3a..5cc72ae0e277 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -225,6 +225,11 @@ static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
 	    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
 }
 
+static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
 /*
  * In nested virtualization, check if L1 asked to exit on external interrupts.
  * For most existing hypervisors, this will always return true.
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d0bea514d447..f688e6e876e8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4515,6 +4515,9 @@ static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
 
+	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+		return true;
+
 	if (!enable_vnmi &&
 	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
 		return false;
-- 
cgit v1.2.3


From bbdad0b5a708ddb37a9f051504c2133fa92df97a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 08:06:43 -0400
Subject: KVM: nSVM: Report NMIs as allowed when in L2 and Exit-on-NMI is set

Report NMIs as allowed when the vCPU is in L2 and L2 is being run with
Exit-on-NMI enabled, as NMIs are always unblocked from L1's perspective
in this case.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 5 -----
 arch/x86/kvm/svm/svm.c    | 3 +++
 arch/x86/kvm/svm/svm.h    | 5 +++++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3f268a3041a3..2828fa5b6016 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -799,11 +799,6 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	return vmexit;
 }
 
-static bool nested_exit_on_nmi(struct vcpu_svm *svm)
-{
-	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
-}
-
 static void nested_svm_nmi(struct vcpu_svm *svm)
 {
 	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 739414028536..c4f1846b6259 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3071,6 +3071,9 @@ static bool svm_nmi_allowed(struct kvm_vcpu *vcpu)
 	struct vmcb *vmcb = svm->vmcb;
 	bool ret;
 
+	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+		return true;
+
 	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
 	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
 	ret = ret && gif_set(svm);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a2bc33aadb67..d8ae654340d4 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -378,6 +378,11 @@ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 	return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
 }
 
+static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
+{
+	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
+}
+
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			  struct vmcb *nested_vmcb, struct kvm_host_map *map);
 int nested_svm_vmrun(struct vcpu_svm *svm);
-- 
cgit v1.2.3


From 55714cddbf1028bbfa19fd7d69182de3f135ce99 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 08:17:28 -0400
Subject: KVM: nSVM: Move SMI vmexit handling to svm_check_nested_events()

Unlike VMX, SVM allows a hypervisor to take a SMI vmexit without having
any special SMM-monitor enablement sequence.  Therefore, it has to be
handled like interrupts and NMIs.  Check for an unblocked SMI in
svm_check_nested_events() so that pending SMIs are correctly prioritized
over IRQs and NMIs when the latter events will trigger VM-Exit.

Note that there is no need to test explicitly for SMI vmexits, because
guests always runs outside SMM and therefore can never get an SMI while
they are blocked.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 16 ++++++++++++++++
 arch/x86/kvm/svm/svm.c    |  8 --------
 arch/x86/kvm/svm/svm.h    |  5 +++++
 3 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 2828fa5b6016..aaec6d0aa701 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -799,6 +799,15 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	return vmexit;
 }
 
+static void nested_svm_smi(struct vcpu_svm *svm)
+{
+	svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
+
+	nested_svm_vmexit(svm);
+}
+
 static void nested_svm_nmi(struct vcpu_svm *svm)
 {
 	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
@@ -831,6 +840,13 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
 		svm->nested.nested_run_pending;
 
+	if (vcpu->arch.smi_pending && nested_exit_on_smi(svm)) {
+		if (block_nested_events)
+			return -EBUSY;
+		nested_svm_smi(svm);
+		return 0;
+	}
+
 	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(svm)) {
 		if (block_nested_events)
 			return -EBUSY;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c4f1846b6259..83b8bc305fe1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3778,14 +3778,6 @@ static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
 	if (!gif_set(svm))
 		return false;
 
-	if (is_guest_mode(&svm->vcpu) &&
-	    svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
-		/* TODO: Might need to set exit_info_1 and exit_info_2 here */
-		svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-		svm->nested.exit_required = true;
-		return false;
-	}
-
 	return !is_smm(vcpu);
 }
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d8ae654340d4..4dc6d2b4b721 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -378,6 +378,11 @@ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 	return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
 }
 
+static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
+{
+	return (svm->nested.intercept & (1ULL << INTERCEPT_SMI));
+}
+
 static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 {
 	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
-- 
cgit v1.2.3


From 1b660b6baaafd8b9056740b83decd7fc74023627 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:44 -0700
Subject: KVM: VMX: Split out architectural interrupt/NMI blocking checks

Move the architectural (non-KVM specific) interrupt/NMI blocking checks
to a separate helper so that they can be used in a future patch by
vmx_check_nested_events().

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-8-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 35 ++++++++++++++++++++++-------------
 arch/x86/kvm/vmx/vmx.h |  2 ++
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f688e6e876e8..a5140ed7dbf9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4510,21 +4510,35 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 	}
 }
 
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+		return false;
+
+	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+		return true;
+
+	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+		 GUEST_INTR_STATE_NMI));
+}
+
 static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
 
-	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
-		return true;
+	return !vmx_nmi_blocked(vcpu);
+}
 
-	if (!enable_vnmi &&
-	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 		return false;
 
-	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-		   | GUEST_INTR_STATE_NMI));
+	return !(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) ||
+	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
 
 static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
@@ -4532,12 +4546,7 @@ static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
 
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		return true;
-
-	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+	return !vmx_interrupt_blocked(vcpu);
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index edfb739e5907..b5e773267abe 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -344,6 +344,8 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 void update_exception_bitmap(struct kvm_vcpu *vcpu);
 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From cae96af18452336aff85c95c6892aab1ed955eed Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 14:19:26 -0400
Subject: KVM: SVM: Split out architectural interrupt/NMI/SMI blocking checks

Move the architectural (non-KVM specific) interrupt/NMI/SMI blocking checks
to a separate helper so that they can be used in a future patch by
svm_check_nested_events().

No functional change intended.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 51 +++++++++++++++++++++++++++++++++++++++-----------
 arch/x86/kvm/svm/svm.h |  3 +++
 2 files changed, 43 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 83b8bc305fe1..5738be661a1f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3065,22 +3065,33 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
-static bool svm_nmi_allowed(struct kvm_vcpu *vcpu)
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 	bool ret;
 
-	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+	if (!gif_set(svm))
 		return true;
 
-	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
-	ret = ret && gif_set(svm);
+	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+		return false;
+
+	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+	      (svm->vcpu.arch.hflags & HF_NMI_MASK);
 
 	return ret;
 }
 
+static bool svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	if (svm->nested.nested_run_pending)
+		return false;
+
+        return !svm_nmi_blocked(vcpu);
+}
+
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3101,19 +3112,28 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 	}
 }
 
-static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 
 	if (!gif_set(svm) ||
 	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
-		return false;
+		return true;
 
 	if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
+		return !(svm->vcpu.arch.hflags & HF_HIF_MASK);
 	else
-		return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+		return !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+}
+
+static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	if (svm->nested.nested_run_pending)
+		return false;
+
+        return !svm_interrupt_blocked(vcpu);
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3770,15 +3790,24 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
 	vcpu->arch.mcg_cap &= 0x1ff;
 }
 
-static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
+bool svm_smi_blocked(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	/* Per APM Vol.2 15.22.2 "Response to SMI" */
 	if (!gif_set(svm))
+		return true;
+
+	return is_smm(vcpu);
+}
+
+static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	if (svm->nested.nested_run_pending)
 		return false;
 
-	return !is_smm(vcpu);
+	return !svm_smi_blocked(vcpu);
 }
 
 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 4dc6d2b4b721..f6d604b72a4c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -366,6 +366,9 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void svm_flush_tlb(struct kvm_vcpu *vcpu);
 void disable_nmi_singlestep(struct vcpu_svm *svm);
+bool svm_smi_blocked(struct kvm_vcpu *vcpu);
+bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
+bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
 
 /* nested.c */
 
-- 
cgit v1.2.3


From 15ff0b450b6541120803de40af5ca96dde1b56b2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:45 -0700
Subject: KVM: nVMX: Preserve IRQ/NMI priority irrespective of exiting behavior

Short circuit vmx_check_nested_events() if an unblocked IRQ/NMI is
pending and needs to be injected into L2, priority between coincident
events is not dependent on exiting behavior.

Fixes: b6b8a1451fc4 ("KVM: nVMX: Rework interception of IRQs and NMIs")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-9-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 643f86d7d537..224887edd39e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3750,9 +3750,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_exit_on_nmi(vcpu))
+			goto no_vmexit;
+
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
 				  INTR_INFO_VALID_MASK, 0);
@@ -3765,9 +3768,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
+	if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_exit_on_intr(vcpu))
+			goto no_vmexit;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
 		return 0;
 	}
-- 
cgit v1.2.3


From 1cd2f0b0dd9218fd11ced9ac97237f0c9517f49e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:46 -0700
Subject: KVM: nVMX: Prioritize SMI over nested IRQ/NMI

Check for an unblocked SMI in vmx_check_nested_events() so that pending
SMIs are correctly prioritized over IRQs and NMIs when the latter events
will trigger VM-Exit.  This also fixes an issue where an SMI that was
marked pending while processing a nested VM-Enter wouldn't trigger an
immediate exit, i.e. would be incorrectly delayed until L2 happened to
take a VM-Exit.

Fixes: 64d6067057d96 ("KVM: x86: stubs for SMM support")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-10-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 224887edd39e..669445136144 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3750,6 +3750,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
+	if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+		if (block_nested_events)
+			return -EBUSY;
+		goto no_vmexit;
+	}
+
 	if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
-- 
cgit v1.2.3


From fc6f7c03ad80693c953d5cdacfad41f174289531 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 18:02:45 -0400
Subject: KVM: nSVM: Report interrupts as allowed when in L2 and
 exit-on-interrupt is set

Report interrupts as allowed when the vCPU is in L2 and L2 is being run with
exit-on-interrupts enabled and EFLAGS.IF=1 (either on the host or on the guest
according to VINTR).  Interrupts are always unblocked from L1's perspective
in this case.

While moving nested_exit_on_intr to svm.h, use INTERCEPT_INTR properly instead
of assuming it's zero (which it is of course).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c |  5 -----
 arch/x86/kvm/svm/svm.c    | 23 +++++++++++++++++------
 arch/x86/kvm/svm/svm.h    |  5 +++++
 3 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index aaec6d0aa701..9c813e08e2cf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -828,11 +828,6 @@ static void nested_svm_intr(struct vcpu_svm *svm)
 	nested_svm_vmexit(svm);
 }
 
-static bool nested_exit_on_intr(struct vcpu_svm *svm)
-{
-	return (svm->nested.intercept & 1ULL);
-}
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5738be661a1f..fba8bdcfed0e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3117,14 +3117,25 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
 
-	if (!gif_set(svm) ||
-	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+	if (!gif_set(svm))
 		return true;
 
-	if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		return !(svm->vcpu.arch.hflags & HF_HIF_MASK);
-	else
-		return !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+	if (is_guest_mode(vcpu)) {
+		/* As long as interrupts are being delivered...  */
+		if ((svm->vcpu.arch.hflags & HF_VINTR_MASK)
+		    ? !(svm->vcpu.arch.hflags & HF_HIF_MASK)
+		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+			return true;
+
+		/* ... vmexits aren't blocked by the interrupt shadow  */
+		if (nested_exit_on_intr(svm))
+			return false;
+	} else {
+		if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+			return true;
+	}
+
+	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
 static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f6d604b72a4c..5cc559ab862d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -386,6 +386,11 @@ static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
 	return (svm->nested.intercept & (1ULL << INTERCEPT_SMI));
 }
 
+static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+	return (svm->nested.intercept & (1ULL << INTERCEPT_INTR));
+}
+
 static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 {
 	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
-- 
cgit v1.2.3


From 221e761090b4ffadf41acaca1e1f6dd97d84ef4f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 08:13:10 -0400
Subject: KVM: nSVM: Preserve IRQ/NMI/SMI priority irrespective of exiting
 behavior

Short circuit vmx_check_nested_events() if an unblocked IRQ/NMI/SMI is
pending and needs to be injected into L2, priority between coincident
events is not dependent on exiting behavior.

Fixes: b518ba9fa691 ("KVM: nSVM: implement check_nested_events for interrupts")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 9c813e08e2cf..712db507c819 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -835,23 +835,29 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
 		svm->nested.nested_run_pending;
 
-	if (vcpu->arch.smi_pending && nested_exit_on_smi(svm)) {
+	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_exit_on_smi(svm))
+			return 0;
 		nested_svm_smi(svm);
 		return 0;
 	}
 
-	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(svm)) {
+	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_exit_on_nmi(svm))
+			return 0;
 		nested_svm_nmi(svm);
 		return 0;
 	}
 
-	if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
+	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
+		if (!nested_exit_on_intr(svm))
+			return 0;
 		nested_svm_intr(svm);
 		return 0;
 	}
-- 
cgit v1.2.3


From 3b82b8d7fdf7c1c284cfa6ebe273435483bcbd9d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:47 -0700
Subject: KVM: x86: WARN on injected+pending exception even in nested case

WARN if a pending exception is coincident with an injected exception
before calling check_nested_events() so that the WARN will fire even if
inject_pending_event() bails early because check_nested_events() detects
the conflict.  Bailing early isn't problematic (quite the opposite), but
suppressing the WARN is undesirable as it could mask a bug elsewhere in
KVM.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-11-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 446fbdd05bd4..cb250f16bdf5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7713,6 +7713,9 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 			kvm_x86_ops.set_irq(vcpu);
 	}
 
+	WARN_ON_ONCE(vcpu->arch.exception.injected &&
+		     vcpu->arch.exception.pending);
+
 	/*
 	 * Call check_nested_events() even if we reinjected a previous event
 	 * in order for caller to determine if it should require immediate-exit
@@ -7731,7 +7734,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 					vcpu->arch.exception.has_error_code,
 					vcpu->arch.exception.error_code);
 
-		WARN_ON_ONCE(vcpu->arch.exception.injected);
 		vcpu->arch.exception.pending = false;
 		vcpu->arch.exception.injected = true;
 
-- 
cgit v1.2.3


From db438592807e059e3b730f556c22ae201f4d4ebc Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:48 -0700
Subject: KVM: VMX: Use vmx_interrupt_blocked() directly from vmx_handle_exit()

Use vmx_interrupt_blocked() instead of bouncing through
vmx_interrupt_allowed() when handling edge cases in vmx_handle_exit().
The nested_run_pending check in vmx_interrupt_allowed() should never
evaluate true in the VM-Exit path.

Hoist the WARN in handle_invalid_guest_state() up to vmx_handle_exit()
to enforce the above assumption for the !enable_vnmi case, and to detect
any other potential bugs with nested VM-Enter.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-12-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a5140ed7dbf9..50a2e464562f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5285,18 +5285,11 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	bool intr_window_requested;
 	unsigned count = 130;
 
-	/*
-	 * We should never reach the point where we are emulating L2
-	 * due to invalid guest state as that means we incorrectly
-	 * allowed a nested VMEntry with an invalid vmcs12.
-	 */
-	WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
-
 	intr_window_requested = exec_controls_get(vmx) &
 				CPU_BASED_INTR_WINDOW_EXITING;
 
 	while (vmx->emulation_required && count-- != 0) {
-		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+		if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
 			return handle_interrupt_window(&vmx->vcpu);
 
 		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
@@ -5913,6 +5906,14 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 	if (enable_pml)
 		vmx_flush_pml_buffer(vcpu);
 
+	/*
+	 * We should never reach this point with a pending nested VM-Enter, and
+	 * more specifically emulation of L2 due to invalid guest state (see
+	 * below) should never happen as that means we incorrectly allowed a
+	 * nested VM-Enter with an invalid vmcs12.
+	 */
+	WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required)
 		return handle_invalid_guest_state(vcpu);
@@ -5979,7 +5980,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 
 	if (unlikely(!enable_vnmi &&
 		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
-		if (vmx_interrupt_allowed(vcpu)) {
+		if (!vmx_interrupt_blocked(vcpu)) {
 			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
 		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
 			   vcpu->arch.nmi_pending) {
-- 
cgit v1.2.3


From 7ab0abdb553e62704be5b7879a0971acf9f36a06 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 22 Apr 2020 19:25:50 -0700
Subject: KVM: VMX: Use vmx_get_rflags() to query RFLAGS in
 vmx_interrupt_blocked()

Use vmx_get_rflags() instead of manually reading vmcs.GUEST_RFLAGS when
querying RFLAGS.IF so that multiple checks against interrupt blocking in
a single run loop only require a single VMREAD.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-14-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 50a2e464562f..71cd210ec368 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4536,7 +4536,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 		return false;
 
-	return !(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) ||
+	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
 	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
-- 
cgit v1.2.3


From c300ab9f08df9e4b9f39d53a0691e234330df124 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2020 14:08:58 -0400
Subject: KVM: x86: Replace late check_nested_events() hack with more precise
 fix

Add an argument to interrupt_allowed and nmi_allowed, to checking if
interrupt injection is blocked.  Use the hook to handle the case where
an interrupt arrives between check_nested_events() and the injection
logic.  Drop the retry of check_nested_events() that hack-a-fixed the
same condition.

Blocking injection is also a bit of a hack, e.g. KVM should do exiting
and non-exiting interrupt processing in a single pass, but it's a more
precise hack.  The old comment is also misleading, e.g. KVM_REQ_EVENT is
purely an optimization, setting it on every run loop (which KVM doesn't
do) should not affect functionality, only performance.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200423022550.15113-13-sean.j.christopherson@intel.com>
[Extend to SVM, add SMI and NMI.  Even though NMI and SMI cannot come
 asynchronously right now, making the fix generic is easy and removes a
 special case. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++---
 arch/x86/kvm/svm/svm.c          | 25 ++++++++++++++++++++-----
 arch/x86/kvm/vmx/vmx.c          | 17 ++++++++++++++---
 arch/x86/kvm/x86.c              | 36 ++++++++++++------------------------
 4 files changed, 49 insertions(+), 35 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44268642b3c6..efe6199c596c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,8 +1138,8 @@ struct kvm_x86_ops {
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu);
 	void (*cancel_injection)(struct kvm_vcpu *vcpu);
-	bool (*interrupt_allowed)(struct kvm_vcpu *vcpu);
-	bool (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+	bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
 	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
@@ -1237,7 +1237,7 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
-	bool (*smi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index fba8bdcfed0e..45c6e4b87eee 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3083,13 +3083,17 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static bool svm_nmi_allowed(struct kvm_vcpu *vcpu)
+static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
 		return false;
 
-        return !svm_nmi_blocked(vcpu);
+	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
+	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
+		return false;
+
+	return !svm_nmi_blocked(vcpu);
 }
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3138,13 +3142,20 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
-static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
 		return false;
 
-        return !svm_interrupt_blocked(vcpu);
+	/*
+	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+	 * e.g. if the IRQ arrived asynchronously after checking nested events.
+	 */
+	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
+		return false;
+
+	return !svm_interrupt_blocked(vcpu);
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -3812,12 +3823,16 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu)
 	return is_smm(vcpu);
 }
 
-static bool svm_smi_allowed(struct kvm_vcpu *vcpu)
+static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
 		return false;
 
+	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
+	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
+		return false;
+
 	return !svm_smi_blocked(vcpu);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 71cd210ec368..e1f5fc919fd9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4523,11 +4523,15 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
 		 GUEST_INTR_STATE_NMI));
 }
 
-static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
 
+	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
+	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+		return false;
+
 	return !vmx_nmi_blocked(vcpu);
 }
 
@@ -4541,11 +4545,18 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
 		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
 
-static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return false;
 
+       /*
+        * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+        * e.g. if the IRQ arrived asynchronously after checking nested events.
+        */
+	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+		return false;
+
 	return !vmx_interrupt_blocked(vcpu);
 }
 
@@ -7688,7 +7699,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEAT_CTL_LMCE_ENABLED;
 }
 
-static bool vmx_smi_allowed(struct kvm_vcpu *vcpu)
+static bool vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	/* we need a nested vmexit to enter SMM, postpone if run is pending */
 	if (to_vmx(vcpu)->nested.nested_run_pending)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cb250f16bdf5..ec20368ac025 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7766,32 +7766,20 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	if (kvm_event_needs_reinjection(vcpu))
 		return 0;
 
-	if (vcpu->arch.smi_pending && kvm_x86_ops.smi_allowed(vcpu)) {
+	if (vcpu->arch.smi_pending &&
+	    kvm_x86_ops.smi_allowed(vcpu, true)) {
 		vcpu->arch.smi_pending = false;
 		++vcpu->arch.smi_count;
 		enter_smm(vcpu);
-	} else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) {
+	} else if (vcpu->arch.nmi_pending &&
+		   kvm_x86_ops.nmi_allowed(vcpu, true)) {
 		--vcpu->arch.nmi_pending;
 		vcpu->arch.nmi_injected = true;
 		kvm_x86_ops.set_nmi(vcpu);
-	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
-		/*
-		 * Because interrupts can be injected asynchronously, we are
-		 * calling check_nested_events again here to avoid a race condition.
-		 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
-		 * proposal and current concerns.  Perhaps we should be setting
-		 * KVM_REQ_EVENT only on certain events and not unconditionally?
-		 */
-		if (is_guest_mode(vcpu)) {
-			r = kvm_x86_ops.nested_ops->check_events(vcpu);
-			if (r != 0)
-				return r;
-		}
-		if (kvm_x86_ops.interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
-					    false);
-			kvm_x86_ops.set_irq(vcpu);
-		}
+	} else if (kvm_cpu_has_injectable_intr(vcpu) &&
+		   kvm_x86_ops.interrupt_allowed(vcpu, true)) {
+		kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+		kvm_x86_ops.set_irq(vcpu);
 	}
 
 	return 0;
@@ -10203,12 +10191,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 	    (vcpu->arch.nmi_pending &&
-	     kvm_x86_ops.nmi_allowed(vcpu)))
+	     kvm_x86_ops.nmi_allowed(vcpu, false)))
 		return true;
 
 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 	    (vcpu->arch.smi_pending &&
-	     kvm_x86_ops.smi_allowed(vcpu)))
+	     kvm_x86_ops.smi_allowed(vcpu, false)))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10260,7 +10248,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops.interrupt_allowed(vcpu);
+	return kvm_x86_ops.interrupt_allowed(vcpu, false);
 }
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10425,7 +10413,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 	 * If interrupts are off we cannot even use an artificial
 	 * halt state.
 	 */
-	return kvm_x86_ops.interrupt_allowed(vcpu);
+	return kvm_arch_interrupt_allowed(vcpu);
 }
 
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From da4ad88cab5867ee240dfd0585e9d115a8cc47db Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Thu, 23 Apr 2020 22:48:37 -0700
Subject: kvm: Replace vcpu->swait with rcuwait

The use of any sort of waitqueue (simple or regular) for
wait/waking vcpus has always been an overkill and semantically
wrong. Because this is per-vcpu (which is blocked) there is
only ever a single waiting vcpu, thus no need for any sort of
queue.

As such, make use of the rcuwait primitive, with the following
considerations:

  - rcuwait already provides the proper barriers that serialize
  concurrent waiter and waker.

  - Task wakeup is done in rcu read critical region, with a
  stable task pointer.

  - Because there is no concurrency among waiters, we need
  not worry about rcuwait_wait_event() calls corrupting
  the wait->task. As a consequence, this saves the locking
  done in swait when modifying the queue. This also applies
  to per-vcore wait for powerpc kvm-hv.

The x86 tscdeadline_latency test mentioned in 8577370fb0cb
("KVM: Use simple waitqueue for vcpu->wq") shows that, on avg,
latency is reduced by around 15-20% with this change.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: kvmarm@lists.cs.columbia.edu
Cc: linux-mips@vger.kernel.org
Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Message-Id: <20200424054837.5138-6-dave@stgolabs.net>
[Avoid extra logic changes. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c                  |  6 ++----
 arch/powerpc/include/asm/kvm_book3s.h |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  2 +-
 arch/powerpc/kvm/book3s_hv.c          | 23 ++++++++++-------------
 arch/powerpc/kvm/powerpc.c            |  2 +-
 arch/x86/kvm/lapic.c                  |  2 +-
 include/linux/kvm_host.h              | 10 +++++-----
 virt/kvm/arm/arch_timer.c             |  3 ++-
 virt/kvm/arm/arm.c                    |  9 +++++----
 virt/kvm/async_pf.c                   |  3 +--
 virt/kvm/kvm_main.c                   | 19 +++++++++----------
 11 files changed, 38 insertions(+), 43 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9f50ceef9978..9787cdec33e6 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -283,8 +283,7 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
 	kvm_mips_callbacks->queue_timer_int(vcpu);
 
 	vcpu->arch.wait = 0;
-	if (swq_has_sleeper(&vcpu->wq))
-		swake_up_one(&vcpu->wq);
+	rcuwait_wake_up(&vcpu->wait);
 
 	return kvm_mips_count_timeout(vcpu);
 }
@@ -511,8 +510,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 
 	dvcpu->arch.wait = 0;
 
-	if (swq_has_sleeper(&dvcpu->wq))
-		swake_up_one(&dvcpu->wq);
+	rcuwait_wake_up(&dvcpu->wait);
 
 	return 0;
 }
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 506e4df2d730..6e5d85ba588d 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -78,7 +78,7 @@ struct kvmppc_vcore {
 	struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
 	struct list_head preempt_list;
 	spinlock_t lock;
-	struct swait_queue_head wq;
+	struct rcuwait wait;
 	spinlock_t stoltb_lock;	/* protects stolen_tb and preempt_tb */
 	u64 stolen_tb;
 	u64 preempt_tb;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1dc63101ffe1..337047ba4a56 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -751,7 +751,7 @@ struct kvm_vcpu_arch {
 	u8 irq_pending; /* Used by XIVE to signal pending guest irqs */
 	u32 last_inst;
 
-	struct swait_queue_head *wqp;
+	struct rcuwait *waitp;
 	struct kvmppc_vcore *vcore;
 	int ret;
 	int trap;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 93493f0cbfe8..7f59c47a5b9d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -230,13 +230,11 @@ static bool kvmppc_ipi_thread(int cpu)
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
 	int cpu;
-	struct swait_queue_head *wqp;
+	struct rcuwait *waitp;
 
-	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (swq_has_sleeper(wqp)) {
-		swake_up_one(wqp);
+	waitp = kvm_arch_vcpu_get_wait(vcpu);
+	if (rcuwait_wake_up(waitp))
 		++vcpu->stat.halt_wakeup;
-	}
 
 	cpu = READ_ONCE(vcpu->arch.thread_cpu);
 	if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@ -2125,7 +2123,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
 
 	spin_lock_init(&vcore->lock);
 	spin_lock_init(&vcore->stoltb_lock);
-	init_swait_queue_head(&vcore->wq);
+	rcuwait_init(&vcore->wait);
 	vcore->preempt_tb = TB_NIL;
 	vcore->lpcr = kvm->arch.lpcr;
 	vcore->first_vcpuid = id;
@@ -3784,7 +3782,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	ktime_t cur, start_poll, start_wait;
 	int do_sleep = 1;
 	u64 block_ns;
-	DECLARE_SWAITQUEUE(wait);
 
 	/* Poll for pending exceptions and ceded state */
 	cur = start_poll = ktime_get();
@@ -3812,10 +3809,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 		}
 	}
 
-	prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
-
+	prepare_to_rcuwait(&vc->wait);
+	set_current_state(TASK_INTERRUPTIBLE);
 	if (kvmppc_vcore_check_block(vc)) {
-		finish_swait(&vc->wq, &wait);
+		finish_rcuwait(&vc->wait);
 		do_sleep = 0;
 		/* If we polled, count this as a successful poll */
 		if (vc->halt_poll_ns)
@@ -3829,7 +3826,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	trace_kvmppc_vcore_blocked(vc, 0);
 	spin_unlock(&vc->lock);
 	schedule();
-	finish_swait(&vc->wq, &wait);
+	finish_rcuwait(&vc->wait);
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
 	trace_kvmppc_vcore_blocked(vc, 1);
@@ -3940,7 +3937,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			kvmppc_start_thread(vcpu, vc);
 			trace_kvm_guest_enter(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
-			swake_up_one(&vc->wq);
+		        rcuwait_wake_up(&vc->wait);
 		}
 
 	}
@@ -4279,7 +4276,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 	user_vrsave = mfspr(SPRN_VRSAVE);
 
-	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+	vcpu->arch.waitp = &vcpu->arch.vcore->wait;
 	vcpu->arch.pgdir = kvm->mm->pgd;
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 052614e9d468..27ccff612903 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -752,7 +752,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	if (err)
 		goto out_vcpu_uninit;
 
-	vcpu->arch.wqp = &vcpu->wq;
+	vcpu->arch.waitp = &vcpu->wait;
 	kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id);
 	return 0;
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 38f7dc9c16ee..42cd2e3ec6fd 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1861,7 +1861,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 	/* If the preempt notifier has already run, it also called apic_timer_expired */
 	if (!apic->lapic_timer.hv_timer_in_use)
 		goto out;
-	WARN_ON(swait_active(&vcpu->wq));
+	WARN_ON(rcuwait_active(&vcpu->wait));
 	cancel_hv_timer(apic);
 	apic_timer_expired(apic);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index abfa71cb5d2d..161684696610 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -23,7 +23,7 @@
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
-#include <linux/swait.h>
+#include <linux/rcuwait.h>
 #include <linux/refcount.h>
 #include <linux/nospec.h>
 #include <asm/signal.h>
@@ -277,7 +277,7 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	struct kvm_run *run;
 
-	struct swait_queue_head wq;
+	struct rcuwait wait;
 	struct pid __rcu *pid;
 	int sigset_active;
 	sigset_t sigset;
@@ -960,12 +960,12 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
 }
 #endif
 
-static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
+static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_WQP
-	return vcpu->arch.wqp;
+	return vcpu->arch.waitp;
 #else
-	return &vcpu->wq;
+	return &vcpu->wait;
 #endif
 }
 
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 93bd59b46848..d5024416e722 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -571,6 +571,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
 	struct timer_map map;
+	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
 	if (unlikely(!timer->enabled))
 		return;
@@ -593,7 +594,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	if (map.emul_ptimer)
 		soft_timer_cancel(&map.emul_ptimer->hrtimer);
 
-	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+	if (rcuwait_active(wait))
 		kvm_timer_blocking(vcpu);
 
 	/*
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index f5390ac2165b..d5db0d6141ff 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -579,16 +579,17 @@ void kvm_arm_resume_guest(struct kvm *kvm)
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		vcpu->arch.pause = false;
-		swake_up_one(kvm_arch_vcpu_wq(vcpu));
+		rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
 	}
 }
 
 static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
 {
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
-	swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
-				       (!vcpu->arch.pause)));
+	rcuwait_wait_event(wait,
+			   (!vcpu->arch.power_off) &&(!vcpu->arch.pause),
+			   TASK_INTERRUPTIBLE);
 
 	if (vcpu->arch.power_off || vcpu->arch.pause) {
 		/* Awaken to handle a signal, request we sleep again later. */
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 15e5b037f92d..10b533f641a6 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,8 +80,7 @@ static void async_pf_execute(struct work_struct *work)
 
 	trace_kvm_async_pf_completed(addr, cr2_or_gpa);
 
-	if (swq_has_sleeper(&vcpu->wq))
-		swake_up_one(&vcpu->wq);
+	rcuwait_wake_up(&vcpu->wait);
 
 	mmput(mm);
 	kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7525f3838160..e12317f32c5e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -349,7 +349,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
-	init_swait_queue_head(&vcpu->wq);
+	rcuwait_init(&vcpu->wait);
 	kvm_async_pf_vcpu_init(vcpu);
 
 	vcpu->pre_pcpu = -1;
@@ -2678,7 +2678,6 @@ out:
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	ktime_t start, cur;
-	DECLARE_SWAITQUEUE(wait);
 	bool waited = false;
 	u64 block_ns;
 
@@ -2704,8 +2703,9 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		} while (single_task_running() && ktime_before(cur, stop));
 	}
 
+	prepare_to_rcuwait(&vcpu->wait);
 	for (;;) {
-		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 
 		if (kvm_vcpu_check_block(vcpu) < 0)
 			break;
@@ -2713,8 +2713,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		waited = true;
 		schedule();
 	}
-
-	finish_swait(&vcpu->wq, &wait);
+	finish_rcuwait(&vcpu->wait);
 	cur = ktime_get();
 out:
 	kvm_arch_vcpu_unblocking(vcpu);
@@ -2746,11 +2745,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_block);
 
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 {
-	struct swait_queue_head *wqp;
+	struct rcuwait *waitp;
 
-	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (swq_has_sleeper(wqp)) {
-		swake_up_one(wqp);
+	waitp = kvm_arch_vcpu_get_wait(vcpu);
+	if (rcuwait_wake_up(waitp)) {
 		WRITE_ONCE(vcpu->ready, true);
 		++vcpu->stat.halt_wakeup;
 		return true;
@@ -2892,7 +2890,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			if (vcpu == me)
 				continue;
-			if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
+			if (rcuwait_active(&vcpu->wait) &&
+			    !vcpu_dy_runnable(vcpu))
 				continue;
 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
 				!kvm_arch_vcpu_in_kernel(vcpu))
-- 
cgit v1.2.3


From fede8076aab4c2280c673492f8f7a2e87712e8b4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 27 Apr 2020 11:55:59 -0400
Subject: KVM: x86: handle wrap around 32-bit address space

KVM is not handling the case where EIP wraps around the 32-bit address
space (that is, outside long mode).  This is needed both in vmx.c
and in emulate.c.  SVM with NRIPS is okay, but it can still print
an error to dmesg due to integer overflow.

Reported-by: Nick Peterson <everdox@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c |  2 ++
 arch/x86/kvm/svm/svm.c |  3 ---
 arch/x86/kvm/vmx/vmx.c | 15 ++++++++++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bddaba9c68dd..de5476f8683e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5798,6 +5798,8 @@ writeback:
 	}
 
 	ctxt->eip = ctxt->_eip;
+	if (ctxt->mode != X86EMUL_MODE_PROT64)
+		ctxt->eip = (u32)ctxt->_eip;
 
 done:
 	if (rc == X86EMUL_PROPAGATE_FAULT) {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 45c6e4b87eee..83175933c0a1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -319,9 +319,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 			return 0;
 	} else {
-		if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
-			pr_err("%s: ip 0x%lx next 0x%llx\n",
-			       __func__, kvm_rip_read(vcpu), svm->next_rip);
 		kvm_rip_write(vcpu, svm->next_rip);
 	}
 	svm_set_interrupt_shadow(vcpu, 0);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e1f5fc919fd9..f519fc7a22a3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1555,7 +1555,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 
 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
-	unsigned long rip;
+	unsigned long rip, orig_rip;
 
 	/*
 	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1567,8 +1567,17 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	 */
 	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
 	    to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
-		rip = kvm_rip_read(vcpu);
-		rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		orig_rip = kvm_rip_read(vcpu);
+		rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+#ifdef CONFIG_X86_64
+		/*
+		 * We need to mask out the high 32 bits of RIP if not in 64-bit
+		 * mode, but just finding out that we are in 64-bit mode is
+		 * quite expensive.  Only do it if there was a carry.
+		 */
+		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+			rip = (u32)rip;
+#endif
 		kvm_rip_write(vcpu, rip);
 	} else {
 		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
-- 
cgit v1.2.3


From f27ad73a6ef791731fa75ca183d5c64a930b9cc1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 27 Apr 2020 10:18:37 -0700
Subject: KVM: VMX: Use accessor to read vmcs.INTR_INFO when handling exception

Use vmx_get_intr_info() when grabbing the cached vmcs.INTR_INFO in
handle_exception_nmi() to ensure the cache isn't stale.  Bypassing the
caching accessor doesn't cause any known issues as the cache is always
refreshed by handle_exception_nmi_irqoff(), but the whole point of
adding the proper caching mechanism was to avoid such dependencies.

Fixes: 8791585837f6 ("KVM: VMX: Cache vmcs.EXIT_INTR_INFO using arch avail_reg flags")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200427171837.22613-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f519fc7a22a3..fbede089d0f7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4705,7 +4705,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	u32 vect_info;
 
 	vect_info = vmx->idt_vectoring_info;
-	intr_info = vmx->exit_intr_info;
+	intr_info = vmx_get_intr_info(vcpu);
 
 	if (is_machine_check(intr_info) || is_nmi(intr_info))
 		return 1; /* handled by handle_exception_nmi_irqoff() */
-- 
cgit v1.2.3


From 5c911beff20aa8639e7a1f28988736c13e03ed54 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 09:31:17 -0700
Subject: KVM: nVMX: Skip IBPB when switching between vmcs01 and vmcs02

Skip the Indirect Branch Prediction Barrier that is triggered on a VMCS
switch when running with spectre_v2_user=on/auto if the switch is
between two VMCSes in the same guest, i.e. between vmcs01 and vmcs02.
The IBPB is intended to prevent one guest from attacking another, which
is unnecessary in the nested case as it's the same guest from KVM's
perspective.

This all but eliminates the overhead observed for nested VMX transitions
when running with CONFIG_RETPOLINE=y and spectre_v2_user=on/auto, which
can be significant, e.g. roughly 3x on current systems.

Reported-by: Alexander Graf <graf@amazon.com>
Cc: KarimAllah Raslan <karahmed@amazon.de>
Cc: stable@vger.kernel.org
Fixes: 15d45071523d ("KVM/x86: Add IBPB support")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200501163117.4655-1-sean.j.christopherson@intel.com>
[Invert direction of bool argument. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c |  2 +-
 arch/x86/kvm/vmx/vmx.c    | 18 ++++++++++++++----
 arch/x86/kvm/vmx/vmx.h    |  3 ++-
 3 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 669445136144..7f754b3bc6dd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -303,7 +303,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 	cpu = get_cpu();
 	prev = vmx->loaded_vmcs;
 	vmx->loaded_vmcs = vmcs;
-	vmx_vcpu_load_vmcs(vcpu, cpu);
+	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
 	vmx_sync_vmcs_host_state(vmx, prev);
 	put_cpu();
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fbede089d0f7..db842ce74e5d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1311,10 +1311,12 @@ after_clear_sn:
 		pi_set_on(pi_desc);
 }
 
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+			struct loaded_vmcs *buddy)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+	struct vmcs *prev;
 
 	if (!already_loaded) {
 		loaded_vmcs_clear(vmx->loaded_vmcs);
@@ -1333,10 +1335,18 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
 		local_irq_enable();
 	}
 
-	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+	prev = per_cpu(current_vmcs, cpu);
+	if (prev != vmx->loaded_vmcs->vmcs) {
 		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
 		vmcs_load(vmx->loaded_vmcs->vmcs);
-		indirect_branch_prediction_barrier();
+
+		/*
+		 * No indirect branch prediction barrier needed when switching
+		 * the active VMCS within a guest, e.g. on nested VM-Enter.
+		 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+		 */
+		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+			indirect_branch_prediction_barrier();
 	}
 
 	if (!already_loaded) {
@@ -1377,7 +1387,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	vmx_vcpu_load_vmcs(vcpu, cpu);
+	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
 	vmx_vcpu_pi_load(vcpu, cpu);
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b5e773267abe..d3d48acc6bd9 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -320,7 +320,8 @@ struct kvm_vmx {
 };
 
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+			struct loaded_vmcs *buddy);
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 int allocate_vpid(void);
 void free_vpid(int vpid);
-- 
cgit v1.2.3


From 1af1bb05625bcdd09522f416b62bcc72cc2fdd3b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 6 May 2020 16:58:50 -0700
Subject: KVM: nVMX: Skip IBPB when temporarily switching between vmcs01 and
 vmcs02

Skip the Indirect Branch Prediction Barrier that is triggered on a VMCS
switch when temporarily loading vmcs02 to synchronize it to vmcs12, i.e.
give copy_vmcs02_to_vmcs12_rare() the same treatment as
vmx_switch_vmcs().

Make vmx_vcpu_load() static now that it's only referenced within vmx.c.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200506235850.22600-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 4 ++--
 arch/x86/kvm/vmx/vmx.c    | 2 +-
 arch/x86/kvm/vmx/vmx.h    | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7f754b3bc6dd..b69f5a2fa362 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3910,12 +3910,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 
 	cpu = get_cpu();
 	vmx->loaded_vmcs = &vmx->nested.vmcs02;
-	vmx_vcpu_load(&vmx->vcpu, cpu);
+	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
 
 	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 
 	vmx->loaded_vmcs = &vmx->vmcs01;
-	vmx_vcpu_load(&vmx->vcpu, cpu);
+	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
 	put_cpu();
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index db842ce74e5d..d044a05a38bd 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1383,7 +1383,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
  */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d3d48acc6bd9..373674d455e1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -322,7 +322,6 @@ struct kvm_vmx {
 bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 			struct loaded_vmcs *buddy);
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 int allocate_vpid(void);
 void free_vpid(int vpid);
 void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
-- 
cgit v1.2.3


From 56ba77a459a72a7d95be74355a40a91e1f6dd7f7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:25 -0700
Subject: KVM: x86: Save L1 TSC offset in 'struct kvm_vcpu_arch'

Save L1's TSC offset in 'struct kvm_vcpu_arch' and drop the kvm_x86_ops
hook read_l1_tsc_offset().  This avoids a retpoline (when configured)
when reading L1's effective TSC, which is done at least once on every
VM-Exit.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm/svm.c          | 11 -----------
 arch/x86/kvm/vmx/vmx.c          | 12 ------------
 arch/x86/kvm/x86.c              |  9 ++++-----
 4 files changed, 5 insertions(+), 29 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index efe6199c596c..3ff671e35d76 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -708,6 +708,7 @@ struct kvm_vcpu_arch {
 		struct gfn_to_pfn_cache cache;
 	} st;
 
+	u64 l1_tsc_offset;
 	u64 tsc_offset;
 	u64 last_guest_tsc;
 	u64 last_host_tsc;
@@ -1165,7 +1166,6 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
 	/* Returns actual tsc_offset set in active VMCS */
 	u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 83175933c0a1..18a63641b612 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -951,16 +951,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
-static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	if (is_guest_mode(vcpu))
-		return svm->nested.hsave->control.tsc_offset;
-
-	return vcpu->arch.tsc_offset;
-}
-
 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -4075,7 +4065,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
-	.read_l1_tsc_offset = svm_read_l1_tsc_offset,
 	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
 
 	.load_mmu_pgd = svm_load_mmu_pgd,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d044a05a38bd..31a8d04a6c41 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1740,17 +1740,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
-		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
-
-	return vcpu->arch.tsc_offset;
-}
-
 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -7881,7 +7870,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
 	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
 
 	.load_mmu_pgd = vmx_load_mmu_pgd,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec20368ac025..3622586a26a7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1910,7 +1910,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
+	u64 curr_offset = vcpu->arch.l1_tsc_offset;
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1952,14 +1952,13 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
-
-	return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+	return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+	vcpu->arch.l1_tsc_offset = offset;
 	vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
 }
 
@@ -2084,7 +2083,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 					   s64 adjustment)
 {
-	u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu);
+	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
 	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
 }
 
-- 
cgit v1.2.3


From 0cc69204e77275105d5b0fc4cf3c970e3579457f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:26 -0700
Subject: KVM: nVMX: Unconditionally validate CR3 during nested transitions

Unconditionally check the validity of the incoming CR3 during nested
VM-Enter/VM-Exit to avoid invoking kvm_read_cr3() in the common case
where the guest isn't using PAE paging.  If vmcs.GUEST_CR3 hasn't yet
been cached (common case), kvm_read_cr3() will trigger a VMREAD.  The
VMREAD (~30 cycles) alone is likely slower than nested_cr3_valid()
(~5 cycles if vcpu->arch.maxphyaddr gets a cache hit), and the poor
exchange only gets worse when retpolines are enabled as the call to
kvm_x86_ops.cache_reg() will incur a retpoline (60+ cycles).

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b69f5a2fa362..90ba278dd572 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1119,21 +1119,20 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
 			       u32 *entry_failure_code)
 {
-	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
-		if (CC(!nested_cr3_valid(vcpu, cr3))) {
-			*entry_failure_code = ENTRY_FAIL_DEFAULT;
-			return -EINVAL;
-		}
+	if (CC(!nested_cr3_valid(vcpu, cr3))) {
+		*entry_failure_code = ENTRY_FAIL_DEFAULT;
+		return -EINVAL;
+	}
 
-		/*
-		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
-		 * must not be dereferenced.
-		 */
-		if (is_pae_paging(vcpu) && !nested_ept) {
-			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
-				*entry_failure_code = ENTRY_FAIL_PDPTE;
-				return -EINVAL;
-			}
+	/*
+	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+	 * must not be dereferenced.
+	 */
+	if (!nested_ept && is_pae_paging(vcpu) &&
+	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
+		if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+			*entry_failure_code = ENTRY_FAIL_PDPTE;
+			return -EINVAL;
 		}
 	}
 
-- 
cgit v1.2.3


From f98c1e77127de7d9ff558570c25d02ef077df50f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:30 -0700
Subject: KVM: VMX: Add proper cache tracking for CR4

Move CR4 caching into the standard register caching mechanism in order
to take advantage of the availability checks provided by regs_avail.
This avoids multiple VMREADs and retpolines (when configured) during
nested VMX transitions as kvm_read_cr4_bits() is invoked multiple times
on each transition, e.g. when stuffing CR0 and CR3.

As an added bonus, this eliminates a kvm_x86_ops hook, saves a retpoline
on SVM when reading CR4, and squashes the confusing naming discrepancy
of "cache_reg" vs. "decache_cr4_guest_bits".

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-7-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/kvm_cache_regs.h   |  5 +++--
 arch/x86/kvm/svm/svm.c          |  5 -----
 arch/x86/kvm/vmx/vmx.c          | 18 +++++++++---------
 arch/x86/kvm/vmx/vmx.h          |  1 +
 5 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3ff671e35d76..3e2f582160fe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -168,6 +168,7 @@ enum kvm_reg {
 
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
 	VCPU_EXREG_CR3,
+	VCPU_EXREG_CR4,
 	VCPU_EXREG_RFLAGS,
 	VCPU_EXREG_SEGMENTS,
 	VCPU_EXREG_EXIT_INFO_1,
@@ -1092,7 +1093,6 @@ struct kvm_x86_ops {
 			    struct kvm_segment *var, int seg);
 	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
 	void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
-	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
 	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
 	int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 62558b9bdda7..921a539bcb96 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -129,8 +129,9 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
 static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
 	ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
-	if (tmask & vcpu->arch.cr4_guest_owned_bits)
-		kvm_x86_ops.decache_cr4_guest_bits(vcpu);
+	if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
+	    !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
+		kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
 	return vcpu->arch.cr4 & mask;
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 18a63641b612..805297e9a1cb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1525,10 +1525,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 }
 
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
 	ulong gcr0 = svm->vcpu.arch.cr0;
@@ -4007,7 +4003,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.get_cpl = svm_get_cpl,
 	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
-	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
 	.set_cr0 = svm_set_cr0,
 	.set_cr4 = svm_set_cr4,
 	.set_efer = svm_set_efer,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 31a8d04a6c41..83e3fe083679 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2204,6 +2204,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
+	unsigned long guest_owned_bits;
+
 	kvm_register_mark_available(vcpu, reg);
 
 	switch (reg) {
@@ -2221,6 +2223,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 		if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
 			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 		break;
+	case VCPU_EXREG_CR4:
+		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+		vcpu->arch.cr4 &= ~guest_owned_bits;
+		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -2922,14 +2930,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 }
 
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
-
-	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
-	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
-}
-
 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -3128,6 +3128,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return 1;
 
 	vcpu->arch.cr4 = cr4;
+	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
 	if (!enable_unrestricted_guest) {
 		if (enable_ept) {
@@ -7809,7 +7810,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.get_cpl = vmx_get_cpl,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
 	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
-	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
 	.set_cr0 = vmx_set_cr0,
 	.set_cr4 = vmx_set_cr4,
 	.set_efer = vmx_set_efer,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 373674d455e1..04bb557acdd2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -453,6 +453,7 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 				  | (1 << VCPU_EXREG_PDPTR)
 				  | (1 << VCPU_EXREG_SEGMENTS)
 				  | (1 << VCPU_EXREG_CR3)
+				  | (1 << VCPU_EXREG_CR4)
 				  | (1 << VCPU_EXREG_EXIT_INFO_1)
 				  | (1 << VCPU_EXREG_EXIT_INFO_2));
 	vcpu->arch.regs_dirty = 0;
-- 
cgit v1.2.3


From bd31fe495d0d1a67fe6f44f06dfef637f202241d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:31 -0700
Subject: KVM: VMX: Add proper cache tracking for CR0

Move CR0 caching into the standard register caching mechanism in order
to take advantage of the availability checks provided by regs_avail.
This avoids multiple VMREADs in the (uncommon) case where kvm_read_cr0()
is called multiple times in a single VM-Exit, and more importantly
eliminates a kvm_x86_ops hook, saves a retpoline on SVM when reading
CR0, and squashes the confusing naming discrepancy of "cache_reg" vs.
"decache_cr0_guest_bits".

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-8-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/kvm_cache_regs.h   |  5 +++--
 arch/x86/kvm/svm/svm.c          |  5 -----
 arch/x86/kvm/vmx/vmx.c          | 16 +++++++---------
 arch/x86/kvm/vmx/vmx.h          |  1 +
 5 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3e2f582160fe..ad882127079f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -167,6 +167,7 @@ enum kvm_reg {
 	NR_VCPU_REGS,
 
 	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+	VCPU_EXREG_CR0,
 	VCPU_EXREG_CR3,
 	VCPU_EXREG_CR4,
 	VCPU_EXREG_RFLAGS,
@@ -1092,7 +1093,6 @@ struct kvm_x86_ops {
 	void (*set_segment)(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
-	void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
 	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
 	int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 921a539bcb96..ff2d0e9ca3bc 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -116,8 +116,9 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
 {
 	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
-	if (tmask & vcpu->arch.cr0_guest_owned_bits)
-		kvm_x86_ops.decache_cr0_guest_bits(vcpu);
+	if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
+	    !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
+		kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
 	return vcpu->arch.cr0 & mask;
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 805297e9a1cb..9e24b5b6fae1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1521,10 +1521,6 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 	mark_dirty(svm->vmcb, VMCB_DT);
 }
 
-static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
 	ulong gcr0 = svm->vcpu.arch.cr0;
@@ -4002,7 +3998,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.set_segment = svm_set_segment,
 	.get_cpl = svm_get_cpl,
 	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
-	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
 	.set_cr0 = svm_set_cr0,
 	.set_cr4 = svm_set_cr4,
 	.set_efer = svm_set_efer,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 83e3fe083679..a7dca7e305b4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2219,6 +2219,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 		if (enable_ept)
 			ept_save_pdptrs(vcpu);
 		break;
+	case VCPU_EXREG_CR0:
+		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+		vcpu->arch.cr0 &= ~guest_owned_bits;
+		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+		break;
 	case VCPU_EXREG_CR3:
 		if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
 			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
@@ -2922,14 +2928,6 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
 	vpid_sync_context(to_vmx(vcpu)->vpid);
 }
 
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-
-	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
-	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
-}
-
 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -3019,6 +3017,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vmcs_writel(CR0_READ_SHADOW, cr0);
 	vmcs_writel(GUEST_CR0, hw_cr0);
 	vcpu->arch.cr0 = cr0;
+	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
 
 	/* depends on vcpu->arch.cr0 to be set to a new value */
 	vmx->emulation_required = emulation_required(vcpu);
@@ -7809,7 +7808,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_segment = vmx_set_segment,
 	.get_cpl = vmx_get_cpl,
 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
 	.set_cr0 = vmx_set_cr0,
 	.set_cr4 = vmx_set_cr4,
 	.set_efer = vmx_set_efer,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 04bb557acdd2..298ddef79d00 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -452,6 +452,7 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
 				  | (1 << VCPU_EXREG_RFLAGS)
 				  | (1 << VCPU_EXREG_PDPTR)
 				  | (1 << VCPU_EXREG_SEGMENTS)
+				  | (1 << VCPU_EXREG_CR0)
 				  | (1 << VCPU_EXREG_CR3)
 				  | (1 << VCPU_EXREG_CR4)
 				  | (1 << VCPU_EXREG_EXIT_INFO_1)
-- 
cgit v1.2.3


From 0047fcade4cb1204cafdf62150081a39356f4dbd Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:33 -0700
Subject: KVM: VMX: Move nested EPT out of kvm_x86_ops.get_tdp_level() hook

Separate the "core" TDP level handling from the nested EPT path to make
it clear that kvm_x86_ops.get_tdp_level() is used if and only if nested
EPT is not in use (kvm_init_shadow_ept_mmu() calculates the level from
the passed in vmcs12->eptp).  Add a WARN_ON() to enforce that the
kvm_x86_ops hook is not called for nested EPT.

This sets the stage for snapshotting the non-"nested EPT" TDP page level
during kvm_cpuid_update() to avoid the retpoline associated with
kvm_x86_ops.get_tdp_level() when resetting the MMU, a relatively
frequent operation when running a nested guest.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-10-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a7dca7e305b4..7a43fbe05e2d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3023,15 +3023,23 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	vmx->emulation_required = emulation_required(vcpu);
 }
 
-static int get_ept_level(struct kvm_vcpu *vcpu)
+static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-		return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
+	WARN_ON(is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)));
+
 	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
 		return 5;
 	return 4;
 }
 
+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+		return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
+
+	return vmx_get_tdp_level(vcpu);
+}
+
 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 {
 	u64 eptp = VMX_EPTP_MT_WB;
@@ -7859,7 +7867,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.set_identity_map_addr = vmx_set_identity_map_addr,
-	.get_tdp_level = get_ept_level,
+	.get_tdp_level = vmx_get_tdp_level,
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
-- 
cgit v1.2.3


From e93fd3b3e89e9664039281fe7e56e6f764f2a909 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 1 May 2020 21:32:34 -0700
Subject: KVM: x86/mmu: Capture TDP level when updating CPUID

Snapshot the TDP level now that it's invariant (SVM) or dependent only
on host capabilities and guest CPUID (VMX).  This avoids having to call
kvm_x86_ops.get_tdp_level() when initializing a TDP MMU and/or
calculating the page role, and thus avoids the associated retpoline.

Drop the WARN in vmx_get_tdp_level() as updating CPUID while L2 is
active is legal, if dodgy.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200502043234.12481-11-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/cpuid.c            | 3 ++-
 arch/x86/kvm/mmu/mmu.c          | 6 +++---
 arch/x86/kvm/svm/nested.c       | 2 +-
 arch/x86/kvm/vmx/vmx.c          | 2 --
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ad882127079f..505cf19ace80 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -687,6 +687,7 @@ struct kvm_vcpu_arch {
 	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
 
 	int maxphyaddr;
+	int tdp_level;
 
 	/* emulate context */
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6828be99b908..44dfaefdad0e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -124,8 +124,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 					   MSR_IA32_MISC_ENABLE_MWAIT);
 	}
 
-	/* Update physical-address width */
+	/* Note, maxphyaddr must be updated before tdp_level. */
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+	vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
 	kvm_mmu_reset_context(vcpu);
 
 	kvm_pmu_refresh(vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e618472c572b..10cb8db54cd0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4894,7 +4894,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
 
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
-	role.base.level = kvm_x86_ops.get_tdp_level(vcpu);
+	role.base.level = vcpu->arch.tdp_level;
 	role.base.direct = true;
 	role.base.gpte_is_8_bytes = true;
 
@@ -4915,7 +4915,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = NULL;
 	context->update_pte = nonpaging_update_pte;
-	context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+	context->shadow_root_level = vcpu->arch.tdp_level;
 	context->direct_map = true;
 	context->get_guest_pgd = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
@@ -5680,7 +5680,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 	 * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
 	 * skip allocating the PDP table.
 	 */
-	if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+	if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
 		return 0;
 
 	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 712db507c819..a89a166d1cb8 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -86,7 +86,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
 	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
 	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+	vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level;
 	reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7a43fbe05e2d..93b2a708b1da 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3025,8 +3025,6 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
 {
-	WARN_ON(is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)));
-
 	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
 		return 5;
 	return 4;
-- 
cgit v1.2.3


From 68cda40d9f3c4cb880108eb22f974d9e3d5dc6c5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 11 May 2020 15:05:29 -0700
Subject: KVM: nVMX: Tweak handling of failure code for nested VM-Enter failure

Use an enum for passing around the failure code for a failed VM-Enter
that results in VM-Exit to provide a level of indirection from the final
resting place of the failure code, vmcs.EXIT_QUALIFICATION.  The exit
qualification field is an unsigned long, e.g. passing around
'u32 exit_qual' throws up red flags as it suggests KVM may be dropping
bits when reporting errors to L1.  This is a red herring because the
only defined failure codes are 0, 2, 3, and 4, i.e. don't come remotely
close to overflowing a u32.

Setting vmcs.EXIT_QUALIFICATION on entry failure is further complicated
by the MSR load list, which returns the (1-based) entry that failed, and
the number of MSRs to load is a 32-bit VMCS field.  At first blush, it
would appear that overflowing a u32 is possible, but the number of MSRs
that can be loaded is hardcapped at 4096 (limited by MSR_IA32_VMX_MISC).

In other words, there are two completely disparate types of data that
eventually get stuffed into vmcs.EXIT_QUALIFICATION, neither of which is
an 'unsigned long' in nature.  This was presumably the reasoning for
switching to 'u32' when the related code was refactored in commit
ca0bde28f2ed6 ("kvm: nVMX: Split VMCS checks from nested_vmx_run()").

Using an enum for the failure code addresses the technically-possible-
but-will-never-happen scenario where Intel defines a failure code that
doesn't fit in a 32-bit integer.  The enum variables and values will
either be automatically sized (gcc 5.4 behavior) or be subjected to some
combination of truncation.  The former case will simply work, while the
latter will trigger a compile-time warning unless the compiler is being
particularly unhelpful.

Separating the failure code from the failed MSR entry allows for
disassociating both from vmcs.EXIT_QUALIFICATION, which avoids the
conundrum where KVM has to choose between 'u32 exit_qual' and tracking
values as 'unsigned long' that have no business being tracked as such.
To cement the split, set vmcs12->exit_qualification directly from the
entry error code or failed MSR index instead of bouncing through a local
variable.

Opportunistically rename the variables in load_vmcs12_host_state() and
vmx_set_nested_state() to call out that they're ignored, set exit_reason
on demand on nested VM-Enter failure, and add a comment in
nested_vmx_load_msr() to call out that returning 'i + 1' can't wrap.

No functional change intended.

Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Jim Mattson <jmattson@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200511220529.11402-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h | 10 ++++++----
 arch/x86/kvm/vmx/nested.c  | 47 +++++++++++++++++++++++++++-------------------
 2 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 5e090d1f03f8..cd7de4b401fe 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -527,10 +527,12 @@ struct vmx_msr_entry {
 /*
  * Exit Qualifications for entry failure during or after loading guest state
  */
-#define ENTRY_FAIL_DEFAULT		0
-#define ENTRY_FAIL_PDPTE		2
-#define ENTRY_FAIL_NMI			3
-#define ENTRY_FAIL_VMCS_LINK_PTR	4
+enum vm_entry_failure_code {
+	ENTRY_FAIL_DEFAULT		= 0,
+	ENTRY_FAIL_PDPTE		= 2,
+	ENTRY_FAIL_NMI			= 3,
+	ENTRY_FAIL_VMCS_LINK_PTR	= 4,
+};
 
 /*
  * Exit Qualifications for EPT Violations
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 90ba278dd572..b2cd8d399644 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -922,6 +922,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	}
 	return 0;
 fail:
+	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
 	return i + 1;
 }
 
@@ -1117,7 +1118,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
  * @entry_failure_code.
  */
 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
-			       u32 *entry_failure_code)
+			       enum vm_entry_failure_code *entry_failure_code)
 {
 	if (CC(!nested_cr3_valid(vcpu, cr3))) {
 		*entry_failure_code = ENTRY_FAIL_DEFAULT;
@@ -2469,7 +2470,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
  * is assigned to entry_failure_code on failure.
  */
 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-			  u32 *entry_failure_code)
+			  enum vm_entry_failure_code *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
@@ -2929,11 +2930,11 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
 
 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 					struct vmcs12 *vmcs12,
-					u32 *exit_qual)
+					enum vm_entry_failure_code *entry_failure_code)
 {
 	bool ia32e;
 
-	*exit_qual = ENTRY_FAIL_DEFAULT;
+	*entry_failure_code = ENTRY_FAIL_DEFAULT;
 
 	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
 	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
@@ -2948,7 +2949,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
-		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
 		return -EINVAL;
 	}
 
@@ -3240,9 +3241,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	enum vm_entry_failure_code entry_failure_code;
 	bool evaluate_pending_interrupts;
-	u32 exit_reason = EXIT_REASON_INVALID_STATE;
-	u32 exit_qual;
+	u32 exit_reason, failed_index;
 
 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
 		kvm_vcpu_flush_tlb_current(vcpu);
@@ -3290,24 +3291,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 			return NVMX_VMENTRY_VMFAIL;
 		}
 
-		if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+		if (nested_vmx_check_guest_state(vcpu, vmcs12,
+						 &entry_failure_code)) {
+			exit_reason = EXIT_REASON_INVALID_STATE;
+			vmcs12->exit_qualification = entry_failure_code;
 			goto vmentry_fail_vmexit;
+		}
 	}
 
 	enter_guest_mode(vcpu);
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
-	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+	if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
+		exit_reason = EXIT_REASON_INVALID_STATE;
+		vmcs12->exit_qualification = entry_failure_code;
 		goto vmentry_fail_vmexit_guest_mode;
+	}
 
 	if (from_vmentry) {
-		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
-		exit_qual = nested_vmx_load_msr(vcpu,
-						vmcs12->vm_entry_msr_load_addr,
-						vmcs12->vm_entry_msr_load_count);
-		if (exit_qual)
+		failed_index = nested_vmx_load_msr(vcpu,
+						   vmcs12->vm_entry_msr_load_addr,
+						   vmcs12->vm_entry_msr_load_count);
+		if (failed_index) {
+			exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+			vmcs12->exit_qualification = failed_index;
 			goto vmentry_fail_vmexit_guest_mode;
+		}
 	} else {
 		/*
 		 * The MMU is not initialized to point at the right entities yet and
@@ -3371,7 +3381,6 @@ vmentry_fail_vmexit:
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 	vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
-	vmcs12->exit_qualification = exit_qual;
 	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
 	return NVMX_VMENTRY_VMEXIT;
@@ -4065,8 +4074,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 				   struct vmcs12 *vmcs12)
 {
+	enum vm_entry_failure_code ignored;
 	struct kvm_segment seg;
-	u32 entry_failure_code;
 
 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
 		vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -4101,7 +4110,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
 	 * couldn't have changed.
 	 */
-	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 
 	if (!enable_ept)
@@ -6001,7 +6010,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12;
-	u32 exit_qual;
+	enum vm_entry_failure_code ignored;
 	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
 		&user_kvm_nested_state->data.vmx[0];
 	int ret;
@@ -6142,7 +6151,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 
 	if (nested_vmx_check_controls(vcpu, vmcs12) ||
 	    nested_vmx_check_host_state(vcpu, vmcs12) ||
-	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
 		goto error_guest_mode;
 
 	vmx->nested.dirty_vmcs12 = true;
-- 
cgit v1.2.3


From a71936ab46f1da1539d97a98dfb2f94ee383d687 Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Wed, 29 Apr 2020 23:43:12 +0800
Subject: kvm: x86: Cleanup vcpu->arch.guest_xstate_size

vcpu->arch.guest_xstate_size lost its only user since commit df1daba7d1cb
("KVM: x86: support XSAVES usage in the host"), so clean it up.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Message-Id: <20200429154312.1411-1-xiaoyao.li@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/cpuid.c            | 4 +---
 arch/x86/kvm/x86.c              | 2 --
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 505cf19ace80..86295a01f5ca 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -657,7 +657,6 @@ struct kvm_vcpu_arch {
 
 	u64 xcr0;
 	u64 guest_supported_xcr0;
-	u32 guest_xstate_size;
 
 	struct kvm_pio_request pio;
 	void *pio_data;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 44dfaefdad0e..cd708b0b460a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -86,12 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
 	if (!best) {
 		vcpu->arch.guest_supported_xcr0 = 0;
-		vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 	} else {
 		vcpu->arch.guest_supported_xcr0 =
 			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-		vcpu->arch.guest_xstate_size = best->ebx =
-			xstate_required_size(vcpu->arch.xcr0, false);
+		best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
 	}
 
 	best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3622586a26a7..6b958d6c9427 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9387,8 +9387,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	}
 	fx_init(vcpu);
 
-	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
 	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-- 
cgit v1.2.3


From b2f432f872d9b4eb07d35f7bd5aa68c48a756f1a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 27 Apr 2020 17:54:20 -0700
Subject: KVM: x86/mmu: Tweak PSE hugepage handling to avoid 2M vs 4M conundrum

Change the PSE hugepage handling in walk_addr_generic() to fire on any
page level greater than PT_PAGE_TABLE_LEVEL, a.k.a. PG_LEVEL_4K.  PSE
paging only has two levels, so "== 2" and "> 1" are functionally the
same, i.e. this is a nop.

A future patch will drop KVM's PT_*_LEVEL enums in favor of the kernel's
PG_LEVEL_* enums, at which point "walker->level == PG_LEVEL_2M" is
semantically incorrect (though still functionally ok).

No functional change intended.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428005422.4235-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index efec7d27b8c5..ca39bd315f70 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -436,7 +436,7 @@ retry_walk:
 	gfn = gpte_to_gfn_lvl(pte, walker->level);
 	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
-	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+	if (PTTYPE == 32 && walker->level > PT_PAGE_TABLE_LEVEL && is_cpuid_PSE36())
 		gfn += pse36_gfn_delta(pte);
 
 	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
-- 
cgit v1.2.3


From e662ec3e0705cfee5b36aecd62adbc36df85eab3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 27 Apr 2020 17:54:21 -0700
Subject: KVM: x86/mmu: Move max hugepage level to a separate #define

Rename PT_MAX_HUGEPAGE_LEVEL to KVM_MAX_HUGEPAGE_LEVEL and make it a
separate define in anticipation of dropping KVM's PT_*_LEVEL enums in
favor of the kernel's PG_LEVEL_* enums.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428005422.4235-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++---
 arch/x86/kvm/mmu/mmu.c          | 17 +++++++++--------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 86295a01f5ca..77a97a72bb3e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -114,10 +114,9 @@ enum {
 	PT_PAGE_TABLE_LEVEL   = 1,
 	PT_DIRECTORY_LEVEL    = 2,
 	PT_PDPE_LEVEL         = 3,
-	/* set max level to the biggest one */
-	PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
 };
-#define KVM_NR_PAGE_SIZES	(PT_MAX_HUGEPAGE_LEVEL - \
+#define KVM_MAX_HUGEPAGE_LEVEL	PT_PDPE_LEVEL
+#define KVM_NR_PAGE_SIZES	(KVM_MAX_HUGEPAGE_LEVEL - \
 				 PT_PAGE_TABLE_LEVEL + 1)
 #define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
 #define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 10cb8db54cd0..7a712e903e93 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1199,7 +1199,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
 	struct kvm_lpage_info *linfo;
 	int i;
 
-	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+	for (i = PT_DIRECTORY_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->disallow_lpage += count;
 		WARN_ON(linfo->disallow_lpage < 0);
@@ -1763,7 +1763,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	int i;
 	bool write_protected = false;
 
-	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+	for (i = PT_PAGE_TABLE_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
 		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
 	}
@@ -1952,7 +1952,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
 			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
-						 PT_MAX_HUGEPAGE_LEVEL,
+						 KVM_MAX_HUGEPAGE_LEVEL,
 						 gfn_start, gfn_end - 1,
 						 &iterator)
 				ret |= handler(kvm, iterator.rmap, memslot,
@@ -4214,7 +4214,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 {
 	int max_level;
 
-	for (max_level = PT_MAX_HUGEPAGE_LEVEL;
+	for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
 	     max_level > PT_PAGE_TABLE_LEVEL;
 	     max_level--) {
 		int page_num = KVM_PAGES_PER_HPAGE(max_level);
@@ -5641,7 +5641,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		      slot_level_handler fn, bool lock_flush_tlb)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static __always_inline bool
@@ -5649,7 +5649,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, bool lock_flush_tlb)
 {
 	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
 static __always_inline bool
@@ -5867,7 +5867,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 				continue;
 
 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+						PT_PAGE_TABLE_LEVEL,
+						KVM_MAX_HUGEPAGE_LEVEL,
 						start, end - 1, true);
 		}
 	}
@@ -5889,7 +5890,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 
 	spin_lock(&kvm->mmu_lock);
 	flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
-				start_level, PT_MAX_HUGEPAGE_LEVEL, false);
+				start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	/*
-- 
cgit v1.2.3


From 3bae0459bcd559506a2ca5807040ff722de5b136 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 27 Apr 2020 17:54:22 -0700
Subject: KVM: x86/mmu: Drop KVM's hugepage enums in favor of the kernel's
 enums

Replace KVM's PT_PAGE_TABLE_LEVEL, PT_DIRECTORY_LEVEL and PT_PDPE_LEVEL
with the kernel's PG_LEVEL_4K, PG_LEVEL_2M and PG_LEVEL_1G.  KVM's
enums are borderline impossible to remember and result in code that is
visually difficult to audit, e.g.

        if (!enable_ept)
                ept_lpage_level = 0;
        else if (cpu_has_vmx_ept_1g_page())
                ept_lpage_level = PT_PDPE_LEVEL;
        else if (cpu_has_vmx_ept_2m_page())
                ept_lpage_level = PT_DIRECTORY_LEVEL;
        else
                ept_lpage_level = PT_PAGE_TABLE_LEVEL;

versus

        if (!enable_ept)
                ept_lpage_level = 0;
        else if (cpu_has_vmx_ept_1g_page())
                ept_lpage_level = PG_LEVEL_1G;
        else if (cpu_has_vmx_ept_2m_page())
                ept_lpage_level = PG_LEVEL_2M;
        else
                ept_lpage_level = PG_LEVEL_4K;

No functional change intended.

Suggested-by: Barret Rhoden <brho@google.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428005422.4235-4-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  12 ++---
 arch/x86/kvm/mmu/mmu.c          | 107 +++++++++++++++++++---------------------
 arch/x86/kvm/mmu/page_track.c   |   4 +-
 arch/x86/kvm/mmu/paging_tmpl.h  |  18 +++----
 arch/x86/kvm/mmu_audit.c        |   6 +--
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/vmx.c          |   6 +--
 arch/x86/kvm/x86.c              |   4 +-
 8 files changed, 73 insertions(+), 86 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 77a97a72bb3e..b8d035694b87 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -110,14 +110,8 @@
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
-enum {
-	PT_PAGE_TABLE_LEVEL   = 1,
-	PT_DIRECTORY_LEVEL    = 2,
-	PT_PDPE_LEVEL         = 3,
-};
-#define KVM_MAX_HUGEPAGE_LEVEL	PT_PDPE_LEVEL
-#define KVM_NR_PAGE_SIZES	(KVM_MAX_HUGEPAGE_LEVEL - \
-				 PT_PAGE_TABLE_LEVEL + 1)
+#define KVM_MAX_HUGEPAGE_LEVEL	PG_LEVEL_1G
+#define KVM_NR_PAGE_SIZES	(KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
 #define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
 #define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
@@ -126,7 +120,7 @@ enum {
 
 static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 {
-	/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+	/* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
 	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
 		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7a712e903e93..fd94668c0f0f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -623,7 +623,7 @@ static int is_large_pte(u64 pte)
 
 static int is_last_spte(u64 pte, int level)
 {
-	if (level == PT_PAGE_TABLE_LEVEL)
+	if (level == PG_LEVEL_4K)
 		return 1;
 	if (is_large_pte(pte))
 		return 1;
@@ -1199,7 +1199,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
 	struct kvm_lpage_info *linfo;
 	int i;
 
-	for (i = PT_DIRECTORY_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->disallow_lpage += count;
 		WARN_ON(linfo->disallow_lpage < 0);
@@ -1228,7 +1228,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = __gfn_to_memslot(slots, gfn);
 
 	/* the non-leaf shadow pages are keeping readonly. */
-	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level > PG_LEVEL_4K)
 		return kvm_slot_page_track_add_page(kvm, slot, gfn,
 						    KVM_PAGE_TRACK_WRITE);
 
@@ -1256,7 +1256,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	gfn = sp->gfn;
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, gfn);
-	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level > PG_LEVEL_4K)
 		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
 						       KVM_PAGE_TRACK_WRITE);
 
@@ -1401,7 +1401,7 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
 	unsigned long idx;
 
 	idx = gfn_to_index(gfn, slot->base_gfn, level);
-	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
+	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
 }
 
 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
@@ -1532,8 +1532,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
-		WARN_ON(page_header(__pa(sptep))->role.level ==
-			PT_PAGE_TABLE_LEVEL);
+		WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
 		drop_spte(kvm, sptep);
 		--kvm->stat.lpages;
 		return true;
@@ -1685,7 +1684,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-					  PT_PAGE_TABLE_LEVEL, slot);
+					  PG_LEVEL_4K, slot);
 		__rmap_write_protect(kvm, rmap_head, false);
 
 		/* clear the first set bit */
@@ -1711,7 +1710,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 
 	while (mask) {
 		rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
-					  PT_PAGE_TABLE_LEVEL, slot);
+					  PG_LEVEL_4K, slot);
 		__rmap_clear_dirty(kvm, rmap_head);
 
 		/* clear the first set bit */
@@ -1763,7 +1762,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	int i;
 	bool write_protected = false;
 
-	for (i = PT_PAGE_TABLE_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+	for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
 		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
 	}
@@ -1951,7 +1950,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
 			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 
-			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+			for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
 						 KVM_MAX_HUGEPAGE_LEVEL,
 						 gfn_start, gfn_end - 1,
 						 &iterator)
@@ -2346,7 +2345,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
 		if (!s->unsync)
 			continue;
 
-		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		WARN_ON(s->role.level != PG_LEVEL_4K);
 		ret |= kvm_sync_page(vcpu, s, invalid_list);
 	}
 
@@ -2375,7 +2374,7 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
 		int level = sp->role.level;
 
 		parents->idx[level-1] = idx;
-		if (level == PT_PAGE_TABLE_LEVEL)
+		if (level == PG_LEVEL_4K)
 			break;
 
 		parents->parent[level-2] = sp;
@@ -2397,7 +2396,7 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
 
 	sp = pvec->page[0].sp;
 	level = sp->role.level;
-	WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+	WARN_ON(level == PG_LEVEL_4K);
 
 	parents->parent[level-2] = sp;
 
@@ -2545,11 +2544,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		 * be inconsistent with guest page table.
 		 */
 		account_shadowed(vcpu->kvm, sp);
-		if (level == PT_PAGE_TABLE_LEVEL &&
-		      rmap_write_protect(vcpu, gfn))
+		if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
 
-		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+		if (level > PG_LEVEL_4K && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
 	clear_page(sp->spt);
@@ -2600,7 +2598,7 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 
 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 {
-	if (iterator->level < PT_PAGE_TABLE_LEVEL)
+	if (iterator->level < PG_LEVEL_4K)
 		return false;
 
 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
@@ -2721,7 +2719,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	struct mmu_page_path parents;
 	struct kvm_mmu_pages pages;
 
-	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+	if (parent->role.level == PG_LEVEL_4K)
 		return 0;
 
 	while (mmu_unsync_walk(parent, &pages)) {
@@ -2920,7 +2918,7 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 		if (sp->unsync)
 			continue;
 
-		WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+		WARN_ON(sp->role.level != PG_LEVEL_4K);
 		kvm_unsync_page(vcpu, sp);
 	}
 
@@ -3019,7 +3017,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
-	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
 	    is_nx_huge_page_enabled()) {
 		pte_access &= ~ACC_EXEC_MASK;
 	}
@@ -3032,7 +3030,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;
 
-	if (level > PT_PAGE_TABLE_LEVEL)
+	if (level > PG_LEVEL_4K)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
@@ -3102,8 +3100,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
 		 * the parent of the now unreachable PTE.
 		 */
-		if (level > PT_PAGE_TABLE_LEVEL &&
-		    !is_large_pte(*sptep)) {
+		if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
 			struct kvm_mmu_page *child;
 			u64 pte = *sptep;
 
@@ -3227,7 +3224,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	if (sp_ad_disabled(sp))
 		return;
 
-	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
 	__direct_pte_prefetch(vcpu, sp, sptep);
@@ -3240,12 +3237,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
 	pte_t *pte;
 	int level;
 
-	BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K ||
-		     PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M ||
-		     PT_PDPE_LEVEL != (int)PG_LEVEL_1G);
-
 	if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
-		return PT_PAGE_TABLE_LEVEL;
+		return PG_LEVEL_4K;
 
 	/*
 	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
@@ -3259,7 +3252,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 	pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
 	if (unlikely(!pte))
-		return PT_PAGE_TABLE_LEVEL;
+		return PG_LEVEL_4K;
 
 	return level;
 }
@@ -3273,28 +3266,28 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
 	kvm_pfn_t mask;
 	int level;
 
-	if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
-		return PT_PAGE_TABLE_LEVEL;
+	if (unlikely(max_level == PG_LEVEL_4K))
+		return PG_LEVEL_4K;
 
 	if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
-		return PT_PAGE_TABLE_LEVEL;
+		return PG_LEVEL_4K;
 
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
 	if (!slot)
-		return PT_PAGE_TABLE_LEVEL;
+		return PG_LEVEL_4K;
 
 	max_level = min(max_level, max_page_level);
-	for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+	for ( ; max_level > PG_LEVEL_4K; max_level--) {
 		linfo = lpage_info_slot(gfn, slot, max_level);
 		if (!linfo->disallow_lpage)
 			break;
 	}
 
-	if (max_level == PT_PAGE_TABLE_LEVEL)
-		return PT_PAGE_TABLE_LEVEL;
+	if (max_level == PG_LEVEL_4K)
+		return PG_LEVEL_4K;
 
 	level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
-	if (level == PT_PAGE_TABLE_LEVEL)
+	if (level == PG_LEVEL_4K)
 		return level;
 
 	level = min(level, max_level);
@@ -3316,7 +3309,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
 	int level = *levelp;
 	u64 spte = *it.sptep;
 
-	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	if (it.level == level && level > PG_LEVEL_4K &&
 	    is_nx_huge_page_enabled() &&
 	    is_shadow_present_pte(spte) &&
 	    !is_large_pte(spte)) {
@@ -3573,7 +3566,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			 *
 			 * See the comments in kvm_arch_commit_memory_region().
 			 */
-			if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+			if (sp->role.level > PG_LEVEL_4K)
 				break;
 		}
 
@@ -4132,7 +4125,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		return r;
 
 	if (lpage_disallowed)
-		max_level = PT_PAGE_TABLE_LEVEL;
+		max_level = PG_LEVEL_4K;
 
 	if (fast_page_fault(vcpu, gpa, error_code))
 		return RET_PF_RETRY;
@@ -4168,7 +4161,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
 	return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
-				 PT_DIRECTORY_LEVEL, false);
+				 PG_LEVEL_2M, false);
 }
 
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -4215,7 +4208,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	int max_level;
 
 	for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
-	     max_level > PT_PAGE_TABLE_LEVEL;
+	     max_level > PG_LEVEL_4K;
 	     max_level--) {
 		int page_num = KVM_PAGES_PER_HPAGE(max_level);
 		gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
@@ -4376,11 +4369,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
 	gpte &= level - mmu->last_nonleaf_level;
 
 	/*
-	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
+	 * iff level <= PG_LEVEL_4K, which for our purpose means
+	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
 	 */
-	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+	gpte |= level - PG_LEVEL_4K - 1;
 
 	return gpte & PT_PAGE_SIZE_MASK;
 }
@@ -5193,7 +5186,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  const void *new)
 {
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+	if (sp->role.level != PG_LEVEL_4K) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
 		return;
         }
@@ -5251,7 +5244,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
 	 * Skip write-flooding detected for the sp whose level is 1, because
 	 * it can become unsync, then the guest page is not write-protected.
 	 */
-	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level == PG_LEVEL_4K)
 		return false;
 
 	atomic_inc(&sp->write_flooding_count);
@@ -5582,9 +5575,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
 	if (tdp_enabled)
 		max_page_level = tdp_page_level;
 	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
-		max_page_level = PT_PDPE_LEVEL;
+		max_page_level = PG_LEVEL_1G;
 	else
-		max_page_level = PT_DIRECTORY_LEVEL;
+		max_page_level = PG_LEVEL_2M;
 }
 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
 
@@ -5640,7 +5633,7 @@ static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		      slot_level_handler fn, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
 				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
@@ -5648,7 +5641,7 @@ static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
 				 KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
@@ -5656,8 +5649,8 @@ static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		 slot_level_handler fn, bool lock_flush_tlb)
 {
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
+				 PG_LEVEL_4K, lock_flush_tlb);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -5867,7 +5860,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 				continue;
 
 			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
-						PT_PAGE_TABLE_LEVEL,
+						PG_LEVEL_4K,
 						KVM_MAX_HUGEPAGE_LEVEL,
 						start, end - 1, true);
 		}
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index ddc1ec3bdacd..a7bcde34d1f2 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -61,7 +61,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
 {
 	int index, val;
 
-	index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+	index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
 
 	val = slot->arch.gfn_track[mode][index];
 
@@ -151,7 +151,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
 	if (!slot)
 		return false;
 
-	index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+	index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
 	return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ca39bd315f70..38c576495048 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -75,7 +75,7 @@
 #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
 
 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
-#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
 
 /*
  * The guest_walker structure emulates the behavior of the hardware page
@@ -198,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	    !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
 
-	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
 		goto no_present;
 
 	return false;
@@ -436,7 +436,7 @@ retry_walk:
 	gfn = gpte_to_gfn_lvl(pte, walker->level);
 	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
 
-	if (PTTYPE == 32 && walker->level > PT_PAGE_TABLE_LEVEL && is_cpuid_PSE36())
+	if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
 		gfn += pse36_gfn_delta(pte);
 
 	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
@@ -552,7 +552,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * we call mmu_set_spte() with host_writable = true because
 	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
 	 */
-	mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+	mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn,
 		     true, true);
 
 	kvm_release_pfn_clean(pfn);
@@ -575,7 +575,7 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 	u64 mask;
 	int r, index;
 
-	if (level == PT_PAGE_TABLE_LEVEL) {
+	if (level == PG_LEVEL_4K) {
 		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
 		base_gpa = pte_gpa & ~mask;
 		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
@@ -600,7 +600,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 
 	sp = page_header(__pa(sptep));
 
-	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level > PG_LEVEL_4K)
 		return;
 
 	if (sp->role.direct)
@@ -828,7 +828,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
 
 	if (lpage_disallowed || is_self_change_mapping)
-		max_level = PT_PAGE_TABLE_LEVEL;
+		max_level = PG_LEVEL_4K;
 	else
 		max_level = walker.level;
 
@@ -884,7 +884,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 {
 	int offset = 0;
 
-	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+	WARN_ON(sp->role.level != PG_LEVEL_4K);
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -1070,7 +1070,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte_ret |= set_spte(vcpu, &sp->spt[i],
-					 pte_access, PT_PAGE_TABLE_LEVEL,
+					 pte_access, PG_LEVEL_4K,
 					 gfn, spte_to_pfn(sp->spt[i]),
 					 true, false, host_writable);
 	}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ca39f62aabc6..9d2844f87f6d 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -100,7 +100,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 	sp = page_header(__pa(sptep));
 
 	if (sp->unsync) {
-		if (level != PT_PAGE_TABLE_LEVEL) {
+		if (level != PG_LEVEL_4K) {
 			audit_printk(vcpu->kvm, "unsync sp: %p "
 				     "level = %d\n", sp, level);
 			return;
@@ -176,7 +176,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	int i;
 
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+	if (sp->role.level != PG_LEVEL_4K)
 		return;
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
@@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, sp->gfn);
-	rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+	rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
 
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		if (is_writable_pte(*sptep))
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9e24b5b6fae1..4c808cc059f1 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -888,7 +888,7 @@ static __init int svm_hardware_setup(void)
 	if (npt_enabled && !npt)
 		npt_enabled = false;
 
-	kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
+	kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
 	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
 	if (nrips) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 93b2a708b1da..30faae51573d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8011,11 +8011,11 @@ static __init int hardware_setup(void)
 	if (!enable_ept)
 		ept_lpage_level = 0;
 	else if (cpu_has_vmx_ept_1g_page())
-		ept_lpage_level = PT_PDPE_LEVEL;
+		ept_lpage_level = PG_LEVEL_1G;
 	else if (cpu_has_vmx_ept_2m_page())
-		ept_lpage_level = PT_DIRECTORY_LEVEL;
+		ept_lpage_level = PG_LEVEL_2M;
 	else
-		ept_lpage_level = PT_PAGE_TABLE_LEVEL;
+		ept_lpage_level = PG_LEVEL_4K;
 	kvm_configure_mmu(enable_ept, ept_lpage_level);
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b958d6c9427..a52ef81fb87a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10046,7 +10046,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
 	/* Still write protect RO slot */
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
+		kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
 		return;
 	}
 
@@ -10086,7 +10086,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		} else {
 			int level =
 				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
-				PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+				PG_LEVEL_2M : PG_LEVEL_4K;
 
 			/*
 			 * If we're with initial-all-set, we don't need
-- 
cgit v1.2.3


From 8123f265248c85603d55f4e97c68576f45eb1e4d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 27 Apr 2020 19:37:14 -0700
Subject: KVM: x86/mmu: Add a helper to consolidate root sp allocation

Add a helper, mmu_alloc_root(), to consolidate the allocation of a root
shadow page, which has the same basic mechanics for all flavors of TDP
and shadow paging.

Note, __pa(sp->spt) doesn't need to be protected by mmu_lock, sp->spt
points at a kernel page.

No functional change intended.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428023714.31923-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 88 ++++++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 49 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fd94668c0f0f..907625fea7b3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3678,37 +3678,43 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	return ret;
 }
 
-static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
+			    u8 level, bool direct)
 {
 	struct kvm_mmu_page *sp;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	if (make_mmu_pages_available(vcpu)) {
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		return INVALID_PAGE;
+	}
+	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+	++sp->root_count;
+
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	return __pa(sp->spt);
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+	u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level;
+	hpa_t root;
 	unsigned i;
 
-	if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
-		spin_lock(&vcpu->kvm->mmu_lock);
-		if(make_mmu_pages_available(vcpu) < 0) {
-			spin_unlock(&vcpu->kvm->mmu_lock);
+	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+		if (!VALID_PAGE(root))
 			return -ENOSPC;
-		}
-		sp = kvm_mmu_get_page(vcpu, 0, 0,
-				vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
-		++sp->root_count;
-		spin_unlock(&vcpu->kvm->mmu_lock);
-		vcpu->arch.mmu->root_hpa = __pa(sp->spt);
-	} else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
+		vcpu->arch.mmu->root_hpa = root;
+	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
 		for (i = 0; i < 4; ++i) {
-			hpa_t root = vcpu->arch.mmu->pae_root[i];
+			MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
 
-			MMU_WARN_ON(VALID_PAGE(root));
-			spin_lock(&vcpu->kvm->mmu_lock);
-			if (make_mmu_pages_available(vcpu) < 0) {
-				spin_unlock(&vcpu->kvm->mmu_lock);
+			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
+					      i << 30, PT32_ROOT_LEVEL, true);
+			if (!VALID_PAGE(root))
 				return -ENOSPC;
-			}
-			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
-					i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
-			root = __pa(sp->spt);
-			++sp->root_count;
-			spin_unlock(&vcpu->kvm->mmu_lock);
 			vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
 		}
 		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
@@ -3723,9 +3729,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu_page *sp;
 	u64 pdptr, pm_mask;
 	gfn_t root_gfn, root_pgd;
+	hpa_t root;
 	int i;
 
 	root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu);
@@ -3739,20 +3745,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * write-protect the guests page table root.
 	 */
 	if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
-		hpa_t root = vcpu->arch.mmu->root_hpa;
+		MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
-		MMU_WARN_ON(VALID_PAGE(root));
-
-		spin_lock(&vcpu->kvm->mmu_lock);
-		if (make_mmu_pages_available(vcpu) < 0) {
-			spin_unlock(&vcpu->kvm->mmu_lock);
+		root = mmu_alloc_root(vcpu, root_gfn, 0,
+				      vcpu->arch.mmu->shadow_root_level, false);
+		if (!VALID_PAGE(root))
 			return -ENOSPC;
-		}
-		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
-				vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
-		root = __pa(sp->spt);
-		++sp->root_count;
-		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu->root_hpa = root;
 		goto set_root_pgd;
 	}
@@ -3767,9 +3765,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
 
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu->pae_root[i];
-
-		MMU_WARN_ON(VALID_PAGE(root));
+		MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
 		if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
 			pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
 			if (!(pdptr & PT_PRESENT_MASK)) {
@@ -3780,17 +3776,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 			if (mmu_check_root(vcpu, root_gfn))
 				return 1;
 		}
-		spin_lock(&vcpu->kvm->mmu_lock);
-		if (make_mmu_pages_available(vcpu) < 0) {
-			spin_unlock(&vcpu->kvm->mmu_lock);
-			return -ENOSPC;
-		}
-		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
-				      0, ACC_ALL);
-		root = __pa(sp->spt);
-		++sp->root_count;
-		spin_unlock(&vcpu->kvm->mmu_lock);
 
+		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
+				      PT32_ROOT_LEVEL, false);
+		if (!VALID_PAGE(root))
+			return -ENOSPC;
 		vcpu->arch.mmu->pae_root[i] = root | pm_mask;
 	}
 	vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
-- 
cgit v1.2.3


From c16312f4fa75e786f5748a7b721a05e6b0d761fe Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 27 Apr 2020 22:50:35 +0200
Subject: KVM: VMX: Remove unneeded __ASM_SIZE usage with POP instruction

POP [mem] defaults to the word size, and the only legal non-default
size is 16 bits, e.g. a 32-bit POP will #UD in 64-bit mode and vice
versa, no need to use __ASM_SIZE macro to force operating mode.

Changes since v1:
- Fix commit message.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Message-Id: <20200427205035.1594232-1-ubizjak@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmenter.S | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 51d1a82742fd..e0a182cb3cdd 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -166,13 +166,13 @@ SYM_FUNC_START(__vmx_vcpu_run)
 	mov WORD_SIZE(%_ASM_SP), %_ASM_AX
 
 	/* Save all guest registers, including RAX from the stack */
-	__ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
-	mov %_ASM_CX,   VCPU_RCX(%_ASM_AX)
-	mov %_ASM_DX,   VCPU_RDX(%_ASM_AX)
-	mov %_ASM_BX,   VCPU_RBX(%_ASM_AX)
-	mov %_ASM_BP,   VCPU_RBP(%_ASM_AX)
-	mov %_ASM_SI,   VCPU_RSI(%_ASM_AX)
-	mov %_ASM_DI,   VCPU_RDI(%_ASM_AX)
+	pop           VCPU_RAX(%_ASM_AX)
+	mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+	mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+	mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+	mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+	mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
 #ifdef CONFIG_X86_64
 	mov %r8,  VCPU_R8 (%_ASM_AX)
 	mov %r9,  VCPU_R9 (%_ASM_AX)
-- 
cgit v1.2.3


From dd03bcaad0b1a62c8ea6297e6f2a5993c1c5cd30 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 16 Apr 2020 11:58:59 -0400
Subject: KVM: X86: Force ASYNC_PF_PER_VCPU to be power of two

Forcing the ASYNC_PF_PER_VCPU to be power of two is much easier to be
used rather than calling roundup_pow_of_two() from time to time.  Do
this by adding a BUILD_BUG_ON() inside the hash function.

Another point is that generally async pf does not allow concurrency
over ASYNC_PF_PER_VCPU after all (see kvm_setup_async_pf()), so it
does not make much sense either to have it not a power of two or some
of the entries will definitely be wasted.

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20200416155859.267366-1-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/x86.c              | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b8d035694b87..04f44961858b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -763,7 +763,7 @@ struct kvm_vcpu_arch {
 
 	struct {
 		bool halted;
-		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+		gfn_t gfns[ASYNC_PF_PER_VCPU];
 		struct gfn_to_hva_cache data;
 		u64 msr_val;
 		u32 id;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a52ef81fb87a..8e28f21fb39b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -257,7 +257,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 		vcpu->arch.apf.gfns[i] = ~0;
 }
 
@@ -10310,12 +10310,14 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 
 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 {
+	BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
 	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
 }
 
 static inline u32 kvm_async_pf_next_probe(u32 key)
 {
-	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
 }
 
 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -10333,7 +10335,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
 	int i;
 	u32 key = kvm_async_pf_hash_fn(gfn);
 
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+	for (i = 0; i < ASYNC_PF_PER_VCPU &&
 		     (vcpu->arch.apf.gfns[key] != gfn &&
 		      vcpu->arch.apf.gfns[key] != ~0); i++)
 		key = kvm_async_pf_next_probe(key);
-- 
cgit v1.2.3


From 0fd460446912dd96fd1e08995f57b14806c10478 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 16 Apr 2020 11:59:10 -0400
Subject: KVM: X86: Sanity check on gfn before removal

The index returned by kvm_async_pf_gfn_slot() will be removed when an
async pf gfn is going to be removed.  However kvm_async_pf_gfn_slot()
is not reliable in that it can return the last key it loops over even
if the gfn is not found in the async gfn array.  It should never
happen, but it's still better to sanity check against that to make
sure no unexpected gfn will be removed.

Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20200416155910.267514-1-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e28f21fb39b..370288fdedba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10353,6 +10353,10 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 	u32 i, j, k;
 
 	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+		return;
+
 	while (true) {
 		vcpu->arch.apf.gfns[i] = ~0;
 		do {
-- 
cgit v1.2.3


From 551896e0e060ca32947621e353b4918e90b88822 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 4 May 2020 17:57:06 +0200
Subject: KVM: VMX: Improve handle_external_interrupt_irqoff inline assembly

Improve handle_external_interrupt_irqoff inline assembly in several ways:
- remove unneeded %c operand modifiers and "$" prefixes
- use %rsp instead of _ASM_SP, since we are in CONFIG_X86_64 part
- use $-16 immediate to align %rsp
- remove unneeded use of __ASM_SIZE macro
- define "ss" named operand only for X86_64

The patch introduces no functional changes.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Message-Id: <20200504155706.2516956-1-ubizjak@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 30faae51573d..9ce182db7326 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6382,13 +6382,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 
 	asm volatile(
 #ifdef CONFIG_X86_64
-		"mov %%" _ASM_SP ", %[sp]\n\t"
-		"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
-		"push $%c[ss]\n\t"
+		"mov %%rsp, %[sp]\n\t"
+		"and $-16, %%rsp\n\t"
+		"push %[ss]\n\t"
 		"push %[sp]\n\t"
 #endif
 		"pushf\n\t"
-		__ASM_SIZE(push) " $%c[cs]\n\t"
+		"push %[cs]\n\t"
 		CALL_NOSPEC
 		:
 #ifdef CONFIG_X86_64
@@ -6397,7 +6397,9 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 		ASM_CALL_CONSTRAINT
 		:
 		[thunk_target]"r"(entry),
+#ifdef CONFIG_X86_64
 		[ss]"i"(__KERNEL_DS),
+#endif
 		[cs]"i"(__KERNEL_CS)
 	);
 
-- 
cgit v1.2.3


From 2408500dfc8f275f9057586b81466989f0fafc59 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 28 Apr 2020 16:10:24 -0700
Subject: KVM: nVMX: Truncate writes to vmcs.SYSENTER_EIP/ESP for 32-bit vCPU

Explicitly truncate the data written to vmcs.SYSENTER_EIP/ESP on WRMSR
if the virtual CPU doesn't support 64-bit mode.  The SYSENTER address
fields in the VMCS are natural width, i.e. bits 63:32 are dropped if the
CPU doesn't support Intel 64 architectures.  This behavior is visible to
the guest after a VM-Exit/VM-Exit roundtrip, e.g. if the guest sets bits
63:32 in the actual MSR.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428231025.12766-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9ce182db7326..fc2b660aea7d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1943,6 +1943,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	return 0;
 }
 
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+						    u64 data)
+{
+#ifdef CONFIG_X86_64
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+		return (u32)data;
+#endif
+	return (unsigned long)data;
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -1980,13 +1990,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmcs_write32(GUEST_SYSENTER_CS, data);
 		break;
 	case MSR_IA32_SYSENTER_EIP:
-		if (is_guest_mode(vcpu))
+		if (is_guest_mode(vcpu)) {
+			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
 			get_vmcs12(vcpu)->guest_sysenter_eip = data;
+		}
 		vmcs_writel(GUEST_SYSENTER_EIP, data);
 		break;
 	case MSR_IA32_SYSENTER_ESP:
-		if (is_guest_mode(vcpu))
+		if (is_guest_mode(vcpu)) {
+			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
 			get_vmcs12(vcpu)->guest_sysenter_esp = data;
+		}
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
-- 
cgit v1.2.3


From 9e826feb8f114964cbdce026340b6cb9bde68a18 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 28 Apr 2020 16:10:25 -0700
Subject: KVM: nVMX: Drop superfluous VMREAD of vmcs02.GUEST_SYSENTER_*

Don't propagate GUEST_SYSENTER_* from vmcs02 to vmcs12 on nested VM-Exit
as the vmcs12 fields are updated in vmx_set_msr(), and writes to the
corresponding MSRs are always intercepted by KVM when running L2.

Dropping the propagation was intended to be done in the same commit that
added vmcs12 writes in vmx_set_msr()[1], but for reasons unknown was
only shuffled around[2][3].

[1] https://patchwork.kernel.org/patch/10933215
[2] https://patchwork.kernel.org/patch/10933215/#22682289
[3] https://lore.kernel.org/patchwork/patch/1088643

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200428231025.12766-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b2cd8d399644..135adc624592 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3952,10 +3952,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
 	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
 
-	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
-	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
-	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-
 	vmcs12->guest_interruptibility_info =
 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 
-- 
cgit v1.2.3


From dcf068da7eb29362adf13f20e5c44a18d98ed9a3 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 28 Apr 2020 14:23:23 +0800
Subject: KVM: VMX: Introduce generic fastpath handler

Introduce generic fastpath handler to handle MSR fastpath, VMX-preemption
timer fastpath etc; move it after vmx_complete_interrupts() in order to
catch events delivered to the guest, and abort the fast path in later
patches.  While at it, move the kvm_exit tracepoint so that it is printed
for fastpath vmexits as well.

There is no observed performance effect for the IPI fastpath after this patch.

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <1588055009-12677-2-git-send-email-wanpengli@tencent.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fc2b660aea7d..d883fbb63566 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5933,8 +5933,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
 	/*
 	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
 	 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
@@ -6630,6 +6628,16 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 	}
 }
 
+static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+	switch (to_vmx(vcpu)->exit_reason) {
+	case EXIT_REASON_MSR_WRITE:
+		return handle_fastpath_set_msr_irqoff(vcpu);
+	default:
+		return EXIT_FASTPATH_NONE;
+	}
+}
+
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
 static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -6784,20 +6792,21 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
 		kvm_machine_check();
 
+	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+
 	if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 		return EXIT_FASTPATH_NONE;
 
-	if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE)
-		exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
-	else
-		exit_fastpath = EXIT_FASTPATH_NONE;
-
 	vmx->loaded_vmcs->launched = 1;
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 
+	if (is_guest_mode(vcpu))
+		return EXIT_FASTPATH_NONE;
+
+	exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
 	return exit_fastpath;
 }
 
-- 
cgit v1.2.3


From 2c4c41325540cf3abb12aef142c0e550f6afeffc Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 8 May 2020 16:53:48 -0700
Subject: KVM: x86: Print symbolic names of VMX VM-Exit flags in traces

Use __print_flags() to display the names of VMX flags in VM-Exit traces
and strip the flags when printing the basic exit reason, e.g. so that a
failed VM-Entry due to invalid guest state gets recorded as
"INVALID_STATE FAILED_VMENTRY" instead of "0x80000021".

Opportunstically fix misaligned variables in the kvm_exit and
kvm_nested_vmexit_inject tracepoints.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200508235348.19427-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/uapi/asm/vmx.h |  3 +++
 arch/x86/kvm/trace.h            | 32 +++++++++++++++++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index e95b72ec19bc..b8ff9e8ac0d5 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -150,6 +150,9 @@
 	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
 	{ EXIT_REASON_TPAUSE,                "TPAUSE" }
 
+#define VMX_EXIT_REASON_FLAGS \
+	{ VMX_EXIT_REASONS_FAILED_VMENTRY,	"FAILED_VMENTRY" }
+
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 249062f24b94..54a10c98d746 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -225,6 +225,14 @@ TRACE_EVENT(kvm_apic,
 #define KVM_ISA_VMX   1
 #define KVM_ISA_SVM   2
 
+#define kvm_print_exit_reason(exit_reason, isa)				\
+	(isa == KVM_ISA_VMX) ?						\
+	__print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) :	\
+	__print_symbolic(exit_reason, SVM_EXIT_REASONS),		\
+	(isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "",	\
+	(isa == KVM_ISA_VMX) ?						\
+	__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+
 /*
  * Tracepoint for kvm guest exit:
  */
@@ -250,12 +258,10 @@ TRACE_EVENT(kvm_exit,
 					   &__entry->info2);
 	),
 
-	TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx",
+	TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx",
 		  __entry->vcpu_id,
-		 (__entry->isa == KVM_ISA_VMX) ?
-		 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
-		 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
-		 __entry->guest_rip, __entry->info1, __entry->info2)
+		  kvm_print_exit_reason(__entry->exit_reason, __entry->isa),
+		  __entry->guest_rip, __entry->info1, __entry->info2)
 );
 
 /*
@@ -588,12 +594,10 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__entry->exit_int_info_err	= exit_int_info_err;
 		__entry->isa			= isa;
 	),
-	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+	TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx "
 		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  __entry->rip,
-		 (__entry->isa == KVM_ISA_VMX) ?
-		 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
-		 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+		  kvm_print_exit_reason(__entry->exit_code, __entry->isa),
 		  __entry->exit_info1, __entry->exit_info2,
 		  __entry->exit_int_info, __entry->exit_int_info_err)
 );
@@ -626,13 +630,11 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 		__entry->isa			= isa;
 	),
 
-	TP_printk("reason: %s ext_inf1: 0x%016llx "
+	TP_printk("reason: %s%s%s ext_inf1: 0x%016llx "
 		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
-		 (__entry->isa == KVM_ISA_VMX) ?
-		 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
-		 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
-		__entry->exit_info1, __entry->exit_info2,
-		__entry->exit_int_info, __entry->exit_int_info_err)
+		  kvm_print_exit_reason(__entry->exit_code, __entry->isa),
+		  __entry->exit_info1, __entry->exit_info2,
+		  __entry->exit_int_info, __entry->exit_int_info_err)
 );
 
 /*
-- 
cgit v1.2.3


From 5a9f54435a488f8a1153efd36cccee3e7e0fc28b Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 28 Apr 2020 14:23:26 +0800
Subject: KVM: X86: Introduce kvm_vcpu_exit_request() helper

Introduce kvm_vcpu_exit_request() helper, we need to check some conditions
before enter guest again immediately, we skip invoking the exit handler and
go through full run loop if complete fastpath but there is stuff preventing
we enter guest again immediately.

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1588055009-12677-5-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 10 ++++++++--
 arch/x86/kvm/x86.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 370288fdedba..29a41aa98929 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1573,6 +1573,13 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
+		need_resched() || signal_pending(current);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
+
 /*
  * The fast path for frequent and performance sensitive wrmsr emulation,
  * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
@@ -8396,8 +8403,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
 		kvm_x86_ops.sync_pir_to_irr(vcpu);
 
-	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
-	    || need_resched() || signal_pending(current)) {
+	if (kvm_vcpu_exit_request(vcpu)) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		smp_wmb();
 		local_irq_enable();
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7b5ed8ed628e..e02fe28254b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -364,5 +364,6 @@ static inline bool kvm_dr7_valid(u64 data)
 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
 u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
 
 #endif
-- 
cgit v1.2.3


From 404d5d7bff0d419fe11c7eaebca9ec8f25258f95 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 28 Apr 2020 14:23:25 +0800
Subject: KVM: X86: Introduce more exit_fastpath_completion enum values

Adds a fastpath_t typedef since enum lines are a bit long, and replace
EXIT_FASTPATH_SKIP_EMUL_INS with two new exit_fastpath_completion enum values.

- EXIT_FASTPATH_EXIT_HANDLED  kvm will still go through it's full run loop,
                              but it would skip invoking the exit handler.

- EXIT_FASTPATH_REENTER_GUEST complete fastpath, guest can be re-entered
                              without invoking the exit handler or going
                              back to vcpu_run

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1588055009-12677-4-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/svm/svm.c          | 15 +++++++--------
 arch/x86/kvm/vmx/vmx.c          | 26 ++++++++++++++++++--------
 arch/x86/kvm/x86.c              | 19 ++++++++++---------
 arch/x86/kvm/x86.h              |  2 +-
 5 files changed, 39 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 04f44961858b..c3906fb2b93f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -182,8 +182,10 @@ enum {
 
 enum exit_fastpath_completion {
 	EXIT_FASTPATH_NONE,
-	EXIT_FASTPATH_SKIP_EMUL_INS,
+	EXIT_FASTPATH_REENTER_GUEST,
+	EXIT_FASTPATH_EXIT_HANDLED,
 };
+typedef enum exit_fastpath_completion fastpath_t;
 
 struct x86_emulate_ctxt;
 struct x86_exception;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4c808cc059f1..29e7f7bc284d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2893,8 +2893,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = control->exit_info_2;
 }
 
-static int handle_exit(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion exit_fastpath)
+static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
@@ -2952,10 +2951,10 @@ static int handle_exit(struct kvm_vcpu *vcpu,
 		       __func__, svm->vmcb->control.exit_int_info,
 		       exit_code);
 
-	if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-		kvm_skip_emulated_instruction(vcpu);
+	if (exit_fastpath != EXIT_FASTPATH_NONE)
 		return 1;
-	} else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+
+	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
 	    || !svm_exit_handlers[exit_code]) {
 		vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
 		dump_vmcb(vcpu);
@@ -3324,7 +3323,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 	svm_complete_interrupts(svm);
 }
 
-static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
 	if (!is_guest_mode(vcpu) &&
 	    to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@@ -3336,9 +3335,9 @@ static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu
 
 void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
 
-static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
-	enum exit_fastpath_completion exit_fastpath;
+	fastpath_t exit_fastpath;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d883fbb63566..c7730d3aa706 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5926,8 +5926,7 @@ void dump_vmcs(void)
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion exit_fastpath)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exit_reason = vmx->exit_reason;
@@ -6034,10 +6033,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
 		}
 	}
 
-	if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-		kvm_skip_emulated_instruction(vcpu);
+	if (exit_fastpath != EXIT_FASTPATH_NONE)
 		return 1;
-	}
 
 	if (exit_reason >= kvm_vmx_max_exit_handlers)
 		goto unexpected_vmexit;
@@ -6628,7 +6625,7 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
 	}
 }
 
-static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 {
 	switch (to_vmx(vcpu)->exit_reason) {
 	case EXIT_REASON_MSR_WRITE:
@@ -6640,12 +6637,13 @@ static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu
 
 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
 
-static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
-	enum exit_fastpath_completion exit_fastpath;
+	fastpath_t exit_fastpath;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
 
+reenter_guest:
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
 		     vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6807,6 +6805,18 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		return EXIT_FASTPATH_NONE;
 
 	exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
+	if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
+		if (!kvm_vcpu_exit_request(vcpu)) {
+			/*
+			 * FIXME: this goto should be a loop in vcpu_enter_guest,
+			 * but it would incur the cost of a retpoline for now.
+			 * Revisit once static calls are available.
+			 */
+			goto reenter_guest;
+		}
+		exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+	}
+
 	return exit_fastpath;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 29a41aa98929..71749fcb229e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1608,27 +1608,28 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	return 1;
 }
 
-enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 msr = kvm_rcx_read(vcpu);
 	u64 data;
-	int ret = 0;
+	fastpath_t ret = EXIT_FASTPATH_NONE;
 
 	switch (msr) {
 	case APIC_BASE_MSR + (APIC_ICR >> 4):
 		data = kvm_read_edx_eax(vcpu);
-		ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
+			kvm_skip_emulated_instruction(vcpu);
+			ret = EXIT_FASTPATH_EXIT_HANDLED;
+               }
 		break;
 	default:
-		return EXIT_FASTPATH_NONE;
+		break;
 	}
 
-	if (!ret) {
+	if (ret != EXIT_FASTPATH_NONE)
 		trace_kvm_msr_write(msr, data);
-		return EXIT_FASTPATH_SKIP_EMUL_INS;
-	}
 
-	return EXIT_FASTPATH_NONE;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
 
@@ -8205,7 +8206,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_int_win =
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
-	enum exit_fastpath_completion exit_fastpath;
+	fastpath_t exit_fastpath;
 
 	bool req_immediate_exit = false;
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e02fe28254b6..6eb62e97e59f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -274,7 +274,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool kvm_vector_hashing_enabled(void);
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			    int emulation_type, void *insn, int insn_len);
-enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
 extern u64 host_xcr0;
 extern u64 supported_xcr0;
-- 
cgit v1.2.3


From 379a3c8ee44440d5afa505230ed8cb5b0d0e314b Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 28 Apr 2020 14:23:27 +0800
Subject: KVM: VMX: Optimize posted-interrupt delivery for timer fastpath

While optimizing posted-interrupt delivery especially for the timer
fastpath scenario, I measured kvm_x86_ops.deliver_posted_interrupt()
to introduce substantial latency because the processor has to perform
all vmentry tasks, ack the posted interrupt notification vector,
read the posted-interrupt descriptor etc.

This is not only slow, it is also unnecessary when delivering an
interrupt to the current CPU (as is the case for the LAPIC timer) because
PIR->IRR and IRR->RVI synchronization is already performed on vmentry
Therefore skip kvm_vcpu_trigger_posted_interrupt in this case, and
instead do vmx_sync_pir_to_irr() on the EXIT_FASTPATH_REENTER_GUEST
fastpath as well.

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1588055009-12677-6-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 5 ++++-
 virt/kvm/kvm_main.c    | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c7730d3aa706..8d881fcf648e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3936,7 +3936,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 	if (pi_test_and_set_on(&vmx->pi_desc))
 		return 0;
 
-	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+	if (vcpu != kvm_get_running_vcpu() &&
+	    !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
 		kvm_vcpu_kick(vcpu);
 
 	return 0;
@@ -6812,6 +6813,8 @@ reenter_guest:
 			 * but it would incur the cost of a retpoline for now.
 			 * Revisit once static calls are available.
 			 */
+			if (vcpu->arch.apicv_active)
+				vmx_sync_pir_to_irr(vcpu);
 			goto reenter_guest;
 		}
 		exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bef3d8d40685..11844fad60fd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4646,6 +4646,7 @@ struct kvm_vcpu *kvm_get_running_vcpu(void)
 
 	return vcpu;
 }
+EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
 
 /**
  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
-- 
cgit v1.2.3


From 199a8b84c455cfeb81bc060c6824dea473e54dc3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 5 May 2020 06:45:35 -0400
Subject: KVM: x86: introduce kvm_can_use_hv_timer

Replace the ad hoc test in vmx_set_hv_timer with a test in the caller,
start_hv_timer.  This test is not Intel-specific and would be duplicated
when introducing the fast path for the TSC deadline MSR.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c   | 13 ++++++++++---
 arch/x86/kvm/lapic.h   |  2 +-
 arch/x86/kvm/vmx/vmx.c |  4 ----
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 42cd2e3ec6fd..73e51abca21d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -110,11 +110,18 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 	return apic->vcpu->vcpu_id;
 }
 
-bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 {
 	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt);
+
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops.set_hv_timer
+	       && !(kvm_mwait_in_guest(vcpu->kvm) ||
+		    kvm_can_post_timer_interrupt(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer);
 
 static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
 {
@@ -1788,7 +1795,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
 	bool expired;
 
 	WARN_ON(preemptible());
-	if (!kvm_x86_ops.set_hv_timer)
+	if (!kvm_can_use_hv_timer(vcpu))
 		return false;
 
 	if (!ktimer->tscdeadline)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7f15f9e69efe..754f29beb83e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -250,7 +250,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
-bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu);
+bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8d881fcf648e..d2678078eee8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7405,10 +7405,6 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
 	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
-	if (kvm_mwait_in_guest(vcpu->kvm) ||
-		kvm_can_post_timer_interrupt(vcpu))
-		return -EOPNOTSUPP;
-
 	vmx = to_vmx(vcpu);
 	tscl = rdtsc();
 	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-- 
cgit v1.2.3


From ae95f566b3d22ade75c67827f1171594dacc9a03 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 28 Apr 2020 14:23:28 +0800
Subject: KVM: X86: TSCDEADLINE MSR emulation fastpath

This patch implements a fast path for emulation of writes to the TSCDEADLINE
MSR.  Besides shortcutting various housekeeping tasks in the vCPU loop,
the fast path can also deliver the timer interrupt directly without going
through KVM_REQ_PENDING_TIMER because it runs in vCPU context.

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1588055009-12677-7-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 18 ++++++++++++------
 arch/x86/kvm/x86.c   | 16 ++++++++++++++++
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73e51abca21d..2a3b57401a68 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1600,7 +1600,7 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
 	}
 }
 
-static void apic_timer_expired(struct kvm_lapic *apic)
+static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	struct kvm_timer *ktimer = &apic->lapic_timer;
@@ -1611,6 +1611,12 @@ static void apic_timer_expired(struct kvm_lapic *apic)
 	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
 		ktimer->expired_tscdeadline = ktimer->tscdeadline;
 
+	if (!from_timer_fn && vcpu->arch.apicv_active) {
+		WARN_ON(kvm_get_running_vcpu() != vcpu);
+		kvm_apic_inject_pending_timer_irqs(apic);
+		return;
+	}
+
 	if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
 		if (apic->lapic_timer.timer_advance_ns)
 			__kvm_wait_lapic_expire(vcpu);
@@ -1650,7 +1656,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 		expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
 		hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
 	} else
-		apic_timer_expired(apic);
+		apic_timer_expired(apic, false);
 
 	local_irq_restore(flags);
 }
@@ -1758,7 +1764,7 @@ static void start_sw_period(struct kvm_lapic *apic)
 
 	if (ktime_after(ktime_get(),
 			apic->lapic_timer.target_expiration)) {
-		apic_timer_expired(apic);
+		apic_timer_expired(apic, false);
 
 		if (apic_lvtt_oneshot(apic))
 			return;
@@ -1820,7 +1826,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
 		if (atomic_read(&ktimer->pending)) {
 			cancel_hv_timer(apic);
 		} else if (expired) {
-			apic_timer_expired(apic);
+			apic_timer_expired(apic, false);
 			cancel_hv_timer(apic);
 		}
 	}
@@ -1870,7 +1876,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
 		goto out;
 	WARN_ON(rcuwait_active(&vcpu->wait));
 	cancel_hv_timer(apic);
-	apic_timer_expired(apic);
+	apic_timer_expired(apic, false);
 
 	if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
 		advance_periodic_target_expiration(apic);
@@ -2376,7 +2382,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
 	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
 	struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
 
-	apic_timer_expired(apic);
+	apic_timer_expired(apic, true);
 
 	if (lapic_is_periodic(apic)) {
 		advance_periodic_target_expiration(apic);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 71749fcb229e..7fb7c1a8f1a5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1608,6 +1608,15 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 	return 1;
 }
 
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+	if (!kvm_can_use_hv_timer(vcpu))
+		return 1;
+
+	kvm_set_lapic_tscdeadline_msr(vcpu, data);
+	return 0;
+}
+
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 {
 	u32 msr = kvm_rcx_read(vcpu);
@@ -1622,6 +1631,13 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 			ret = EXIT_FASTPATH_EXIT_HANDLED;
                }
 		break;
+	case MSR_IA32_TSCDEADLINE:
+		data = kvm_read_edx_eax(vcpu);
+		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
+			kvm_skip_emulated_instruction(vcpu);
+			ret = EXIT_FASTPATH_REENTER_GUEST;
+		}
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 26efe2fd92e50822674acce1dbc4f2ac6fc1788f Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Wed, 6 May 2020 11:44:01 -0400
Subject: KVM: VMX: Handle preemption timer fastpath

This patch implements a fastpath for the preemption timer vmexit.  The vmexit
can be handled quickly so it can be performed with interrupts off and going
back directly to the guest.

Testing on SKX Server.

cyclictest in guest(w/o mwait exposed, adaptive advance lapic timer is default -1):

5540.5ns -> 4602ns       17%

kvm-unit-test/vmexit.flat:

w/o avanced timer:
tscdeadline_immed: 3028.5  -> 2494.75  17.6%
tscdeadline:       5765.7  -> 5285      8.3%

w/ adaptive advance timer default -1:
tscdeadline_immed: 3123.75 -> 2583     17.3%
tscdeadline:       4663.75 -> 4537      2.7%

Tested-by: Haiwei Li <lihaiwei@tencent.com>
Cc: Haiwei Li <lihaiwei@tencent.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1588055009-12677-8-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d2678078eee8..5fa162f6e1dc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5601,14 +5601,22 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	if (!vmx->req_immediate_exit &&
-	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
 		kvm_lapic_expired_hv_timer(vcpu);
+		return EXIT_FASTPATH_REENTER_GUEST;
+	}
+
+	return EXIT_FASTPATH_NONE;
+}
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	handle_fastpath_preemption_timer(vcpu);
 	return 1;
 }
 
@@ -6631,6 +6639,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 	switch (to_vmx(vcpu)->exit_reason) {
 	case EXIT_REASON_MSR_WRITE:
 		return handle_fastpath_set_msr_irqoff(vcpu);
+	case EXIT_REASON_PREEMPTION_TIMER:
+		return handle_fastpath_preemption_timer(vcpu);
 	default:
 		return EXIT_FASTPATH_NONE;
 	}
-- 
cgit v1.2.3


From e14b7786cb1cf39a12e79bb04e2f43afc2eabc0f Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Wed, 6 May 2020 08:17:55 -0500
Subject: KVM: SVM: Merge svm_enable_vintr into svm_set_vintr

Code clean up and remove unnecessary intercept check for
INTERCEPT_VINTR.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Message-Id: <1588771076-73790-4-git-send-email-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 29e7f7bc284d..801886142cc4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1352,12 +1352,13 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 	}
 }
 
-static inline void svm_enable_vintr(struct vcpu_svm *svm)
+static void svm_set_vintr(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control;
 
 	/* The following fields are ignored when AVIC is enabled */
 	WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+	set_intercept(svm, INTERCEPT_VINTR);
 
 	/*
 	 * This is just a dummy VINTR to actually cause a vmexit to happen.
@@ -1371,13 +1372,6 @@ static inline void svm_enable_vintr(struct vcpu_svm *svm)
 	mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
-static void svm_set_vintr(struct vcpu_svm *svm)
-{
-	set_intercept(svm, INTERCEPT_VINTR);
-	if (is_intercept(svm, INTERCEPT_VINTR))
-		svm_enable_vintr(svm);
-}
-
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
 	clr_intercept(svm, INTERCEPT_VINTR);
-- 
cgit v1.2.3


From de182481629c2dc248adbb4ec5df83cd9d633dd4 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Wed, 6 May 2020 08:17:56 -0500
Subject: KVM: SVM: Remove unnecessary V_IRQ unsetting

This has already been handled in the prior call to svm_clear_vintr().

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Message-Id: <1588771076-73790-5-git-send-email-suravee.suthikulpanit@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 801886142cc4..4e9cd2a73ad0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2660,8 +2660,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
 	 */
 	svm_toggle_avic_for_irq_window(&svm->vcpu, true);
 
-	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-	mark_dirty(svm->vmcb, VMCB_INTR);
 	++svm->vcpu.stat.irq_window_exits;
 	return 1;
 }
-- 
cgit v1.2.3


From 6c1c6e58356b87017839f71120d1c250b4e0f7ff Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 6 May 2020 13:46:53 -0700
Subject: KVM: nVMX: Remove unused 'ops' param from nested_vmx_hardware_setup()

Remove a 'struct kvm_x86_ops' param that got left behind when the nested
ops were moved to their own struct.

Fixes: 33b22172452f0 ("KVM: x86: move nested-related kvm_x86_ops to a separate struct")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200506204653.14683-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 3 +--
 arch/x86/kvm/vmx/nested.h | 3 +--
 arch/x86/kvm/vmx/vmx.c    | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 135adc624592..25f130d3158b 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6421,8 +6421,7 @@ void nested_vmx_hardware_unsetup(void)
 	}
 }
 
-__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
-				     int (*exit_handlers[])(struct kvm_vcpu *))
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 {
 	int i;
 
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 5cc72ae0e277..758bccc26cf9 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -19,8 +19,7 @@ enum nvmx_vmentry_status {
 void vmx_leave_nested(struct kvm_vcpu *vcpu);
 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
 void nested_vmx_hardware_unsetup(void);
-__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops,
-				     int (*exit_handlers[])(struct kvm_vcpu *));
+__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
 void nested_vmx_set_vmcs_shadowing_bitmap(void);
 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5fa162f6e1dc..68f4efb2b9c2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8119,8 +8119,7 @@ static __init int hardware_setup(void)
 		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 					   vmx_capability.ept);
 
-		r = nested_vmx_hardware_setup(&vmx_x86_ops,
-					      kvm_vmx_exit_handlers);
+		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
 		if (r)
 			return r;
 	}
-- 
cgit v1.2.3


From 1739f3d56d24a121535b45267d85b7fe6f6cc4ab Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 8 May 2020 13:36:41 -0700
Subject: KVM: nVMX: Really make emulated nested preemption timer pinned

The PINNED bit is ignored by hrtimer_init. It is only considered when
starting the timer.

When the hrtimer isn't pinned to the same logical processor as the
vCPU thread to be interrupted, the emulated VMX-preemption timer
often fails to adhere to the architectural specification.

Fixes: f15a75eedc18e ("KVM: nVMX: make emulated nested preemption timer pinned")
Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
Reviewed-by: Oliver Upton <oupton@google.com>
Message-Id: <20200508203643.85477-2-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 25f130d3158b..3d50e50de626 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2113,7 +2113,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
 	preemption_timeout *= 1000000;
 	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
 	hrtimer_start(&vmx->nested.preemption_timer,
-		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL_PINNED);
 }
 
 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
-- 
cgit v1.2.3


From ada0098df6569b7f9e9495beb0f1e35718895aa0 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 8 May 2020 13:36:42 -0700
Subject: KVM: nVMX: Change emulated VMX-preemption timer hrtimer to absolute

Prepare for migration of this hrtimer, by changing it from relative to
absolute. (I couldn't get migration to work with a relative timer.)

Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
Reviewed-by: Oliver Upton <oupton@google.com>
Message-Id: <20200508203643.85477-3-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3d50e50de626..51ebb60e1533 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2113,7 +2113,8 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
 	preemption_timeout *= 1000000;
 	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
 	hrtimer_start(&vmx->nested.preemption_timer,
-		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL_PINNED);
+		      ktime_add_ns(ktime_get(), preemption_timeout),
+		      HRTIMER_MODE_ABS_PINNED);
 }
 
 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
@@ -4673,7 +4674,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		goto out_shadow_vmcs;
 
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL_PINNED);
+		     HRTIMER_MODE_ABS_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
 	vmx->nested.vpid02 = allocate_vpid();
-- 
cgit v1.2.3


From 93dff2fed2fb4a513196b7df05742c6fcdfd5178 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 8 May 2020 13:36:43 -0700
Subject: KVM: nVMX: Migrate the VMX-preemption timer

The hrtimer used to emulate the VMX-preemption timer must be pinned to
the same logical processor as the vCPU thread to be interrupted if we
want to have any hope of adhering to the architectural specification
of the VMX-preemption timer. Even with this change, the emulated
VMX-preemption timer VM-exit occasionally arrives too late.

Signed-off-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
Reviewed-by: Oliver Upton <oupton@google.com>
Message-Id: <20200508203643.85477-4-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/irq.c              |  2 ++
 arch/x86/kvm/vmx/vmx.c          | 11 +++++++++++
 3 files changed, 15 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c3906fb2b93f..a3ce391c9e7a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1247,6 +1247,8 @@ struct kvm_x86_ops {
 
 	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
 	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+
+	void (*migrate_timers)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e330e7d125f7..54f7ea68083b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -159,6 +159,8 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
 	__kvm_migrate_apic_timer(vcpu);
 	__kvm_migrate_pit_timer(vcpu);
+	if (kvm_x86_ops.migrate_timers)
+		kvm_x86_ops.migrate_timers(vcpu);
 }
 
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 68f4efb2b9c2..6a03c27ff314 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7814,6 +7814,16 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->nested.vmxon;
 }
 
+static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu)) {
+		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
+
+		if (hrtimer_try_to_cancel(timer) == 1)
+			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+}
+
 static void hardware_unsetup(void)
 {
 	if (nested)
@@ -7957,6 +7967,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 
 	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
 	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+	.migrate_timers = vmx_migrate_timers,
 };
 
 static __init int hardware_setup(void)
-- 
cgit v1.2.3


From cb953129bfe5c0f2da835a0469930873fb7e71df Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 8 May 2020 11:22:40 -0700
Subject: kvm: add halt-polling cpu usage stats

Two new stats for exposing halt-polling cpu usage:
halt_poll_success_ns
halt_poll_fail_ns

Thus sum of these 2 stats is the total cpu time spent polling. "success"
means the VCPU polled until a virtual interrupt was delivered. "fail"
means the VCPU had to schedule out (either because the maximum poll time
was reached or it needed to yield the CPU).

To avoid touching every arch's kvm_vcpu_stat struct, only update and
export halt-polling cpu usage stats if we're on x86.

Exporting cpu usage as a u64 and in nanoseconds means we will overflow at
~500 years, which seems reasonably large.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Jon Cargille <jcargill@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>

Message-Id: <20200508182240.68440-1-jcargill@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/guest.c            |  2 ++
 arch/mips/include/asm/kvm_host.h  |  2 ++
 arch/mips/kvm/mips.c              |  2 ++
 arch/powerpc/kvm/booke.c          |  2 ++
 arch/s390/include/asm/kvm_host.h  |  2 ++
 arch/s390/kvm/kvm-s390.c          |  2 ++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                |  2 ++
 virt/kvm/kvm_main.c               | 18 +++++++++++++++---
 10 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 32c8a675e5a4..3833736dd064 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -415,6 +415,8 @@ struct kvm_vm_stat {
 struct kvm_vcpu_stat {
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
 	u64 halt_poll_invalid;
 	u64 halt_wakeup;
 	u64 hvc_exit_stat;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 863a0d158fb8..55ebb9ea74f6 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -40,6 +40,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("mmio_exit_user", mmio_exit_user),
 	VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
 	VCPU_STAT("exits", exits),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 	{ NULL }
 };
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 2c343c346b79..e28b5a946e26 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -174,6 +174,8 @@ struct kvm_vcpu_stat {
 #endif
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
 	u64 halt_poll_invalid;
 	u64 halt_wakeup;
 };
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9787cdec33e6..99ed08aff31e 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -72,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
 	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
 	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 	{NULL}
 };
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c2984cb6dfa7..888afe8d35cc 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -54,6 +54,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("halt_wakeup", halt_wakeup),
 	VCPU_STAT("doorbell", dbell_exits),
 	VCPU_STAT("guest doorbell", gdbell_exits),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 	VM_STAT("remote_tlb_flush", remote_tlb_flush),
 	{ NULL }
 };
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d6bcd34f3ec3..176f74cc2257 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -375,6 +375,8 @@ struct kvm_vcpu_stat {
 	u64 halt_poll_invalid;
 	u64 halt_no_poll_steal;
 	u64 halt_wakeup;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
 	u64 instruction_lctl;
 	u64 instruction_lctlg;
 	u64 instruction_stctl;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 389ff1b7cd43..a560a368f92c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -75,6 +75,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
 	VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
 	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 	VCPU_STAT("instruction_lctlg", instruction_lctlg),
 	VCPU_STAT("instruction_lctl", instruction_lctl),
 	VCPU_STAT("instruction_stctl", instruction_stctl),
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3ce391c9e7a..fd78bd44b2d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1031,6 +1031,8 @@ struct kvm_vcpu_stat {
 	u64 irq_injections;
 	u64 nmi_injections;
 	u64 req_event;
+	u64 halt_poll_success_ns;
+	u64 halt_poll_fail_ns;
 };
 
 struct x86_instruction_info;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fb7c1a8f1a5..471fccf7f850 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -217,6 +217,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	VCPU_STAT("nmi_injections", nmi_injections),
 	VCPU_STAT("req_event", req_event),
 	VCPU_STAT("l1d_flush", l1d_flush),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
 	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
 	VM_STAT("mmu_pte_write", mmu_pte_write),
 	VM_STAT("mmu_pte_updated", mmu_pte_updated),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11844fad60fd..da6da386b591 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2667,18 +2667,27 @@ out:
 	return ret;
 }
 
+static inline void
+update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
+{
+	if (waited)
+		vcpu->stat.halt_poll_fail_ns += poll_ns;
+	else
+		vcpu->stat.halt_poll_success_ns += poll_ns;
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
-	ktime_t start, cur;
+	ktime_t start, cur, poll_end;
 	bool waited = false;
 	u64 block_ns;
 
 	kvm_arch_vcpu_blocking(vcpu);
 
-	start = cur = ktime_get();
+	start = cur = poll_end = ktime_get();
 	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
 		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
@@ -2694,7 +2703,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 					++vcpu->stat.halt_poll_invalid;
 				goto out;
 			}
-			cur = ktime_get();
+			poll_end = cur = ktime_get();
 		} while (single_task_running() && ktime_before(cur, stop));
 	}
 
@@ -2714,6 +2723,9 @@ out:
 	kvm_arch_vcpu_unblocking(vcpu);
 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
+	update_halt_poll_stats(
+		vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
+
 	if (!kvm_arch_no_poll(vcpu)) {
 		if (!vcpu_valid_wakeup(vcpu)) {
 			shrink_halt_poll_ns(vcpu);
-- 
cgit v1.2.3


From 9ed24f4b712b855dcf7be3025b75b051cb73a2b7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 13 May 2020 11:40:34 +0100
Subject: KVM: arm64: Move virt/kvm/arm to arch/arm64

Now that the 32bit KVM/arm host is a distant memory, let's move the
whole of the KVM/arm64 code into the arm64 tree.

As they said in the song: Welcome Home (Sanitarium).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200513104034.74741-1-maz@kernel.org
---
 MAINTAINERS                           |    1 -
 arch/arm64/kvm/Makefile               |   44 +-
 arch/arm64/kvm/aarch32.c              |  204 +++
 arch/arm64/kvm/arch_timer.c           | 1180 ++++++++++++++
 arch/arm64/kvm/arm.c                  | 1681 ++++++++++++++++++++
 arch/arm64/kvm/handle_exit.c          |    2 +-
 arch/arm64/kvm/hyp/Makefile           |    9 +-
 arch/arm64/kvm/hyp/aarch32.c          |  140 ++
 arch/arm64/kvm/hyp/timer-sr.c         |   49 +
 arch/arm64/kvm/hyp/vgic-v3-sr.c       | 1126 +++++++++++++
 arch/arm64/kvm/hypercalls.c           |   71 +
 arch/arm64/kvm/mmio.c                 |  200 +++
 arch/arm64/kvm/mmu.c                  | 2447 +++++++++++++++++++++++++++++
 arch/arm64/kvm/perf.c                 |   57 +
 arch/arm64/kvm/pmu-emul.c             |  869 ++++++++++
 arch/arm64/kvm/psci.c                 |  564 +++++++
 arch/arm64/kvm/pvtime.c               |  131 ++
 arch/arm64/kvm/trace.h                |  216 +--
 arch/arm64/kvm/trace_arm.h            |  378 +++++
 arch/arm64/kvm/trace_handle_exit.h    |  215 +++
 arch/arm64/kvm/vgic-sys-reg-v3.c      |    2 +-
 arch/arm64/kvm/vgic/trace.h           |   38 +
 arch/arm64/kvm/vgic/vgic-debug.c      |  300 ++++
 arch/arm64/kvm/vgic/vgic-init.c       |  556 +++++++
 arch/arm64/kvm/vgic/vgic-irqfd.c      |  141 ++
 arch/arm64/kvm/vgic/vgic-its.c        | 2783 +++++++++++++++++++++++++++++++++
 arch/arm64/kvm/vgic/vgic-kvm-device.c |  741 +++++++++
 arch/arm64/kvm/vgic/vgic-mmio-v2.c    |  550 +++++++
 arch/arm64/kvm/vgic/vgic-mmio-v3.c    | 1063 +++++++++++++
 arch/arm64/kvm/vgic/vgic-mmio.c       | 1088 +++++++++++++
 arch/arm64/kvm/vgic/vgic-mmio.h       |  227 +++
 arch/arm64/kvm/vgic/vgic-v2.c         |  504 ++++++
 arch/arm64/kvm/vgic/vgic-v3.c         |  691 ++++++++
 arch/arm64/kvm/vgic/vgic-v4.c         |  453 ++++++
 arch/arm64/kvm/vgic/vgic.c            | 1011 ++++++++++++
 arch/arm64/kvm/vgic/vgic.h            |  321 ++++
 virt/kvm/arm/aarch32.c                |  204 ---
 virt/kvm/arm/arch_timer.c             | 1180 --------------
 virt/kvm/arm/arm.c                    | 1681 --------------------
 virt/kvm/arm/hyp/aarch32.c            |  140 --
 virt/kvm/arm/hyp/timer-sr.c           |   49 -
 virt/kvm/arm/hyp/vgic-v3-sr.c         | 1130 -------------
 virt/kvm/arm/hypercalls.c             |   71 -
 virt/kvm/arm/mmio.c                   |  200 ---
 virt/kvm/arm/mmu.c                    | 2447 -----------------------------
 virt/kvm/arm/perf.c                   |   57 -
 virt/kvm/arm/pmu.c                    |  869 ----------
 virt/kvm/arm/psci.c                   |  564 -------
 virt/kvm/arm/pvtime.c                 |  131 --
 virt/kvm/arm/trace.h                  |  379 -----
 virt/kvm/arm/vgic/trace.h             |   38 -
 virt/kvm/arm/vgic/vgic-debug.c        |  300 ----
 virt/kvm/arm/vgic/vgic-init.c         |  556 -------
 virt/kvm/arm/vgic/vgic-irqfd.c        |  141 --
 virt/kvm/arm/vgic/vgic-its.c          | 2783 ---------------------------------
 virt/kvm/arm/vgic/vgic-kvm-device.c   |  741 ---------
 virt/kvm/arm/vgic/vgic-mmio-v2.c      |  550 -------
 virt/kvm/arm/vgic/vgic-mmio-v3.c      | 1063 -------------
 virt/kvm/arm/vgic/vgic-mmio.c         | 1088 -------------
 virt/kvm/arm/vgic/vgic-mmio.h         |  227 ---
 virt/kvm/arm/vgic/vgic-v2.c           |  504 ------
 virt/kvm/arm/vgic/vgic-v3.c           |  693 --------
 virt/kvm/arm/vgic/vgic-v4.c           |  453 ------
 virt/kvm/arm/vgic/vgic.c              | 1011 ------------
 virt/kvm/arm/vgic/vgic.h              |  321 ----
 65 files changed, 19810 insertions(+), 19814 deletions(-)
 create mode 100644 arch/arm64/kvm/aarch32.c
 create mode 100644 arch/arm64/kvm/arch_timer.c
 create mode 100644 arch/arm64/kvm/arm.c
 create mode 100644 arch/arm64/kvm/hyp/aarch32.c
 create mode 100644 arch/arm64/kvm/hyp/timer-sr.c
 create mode 100644 arch/arm64/kvm/hyp/vgic-v3-sr.c
 create mode 100644 arch/arm64/kvm/hypercalls.c
 create mode 100644 arch/arm64/kvm/mmio.c
 create mode 100644 arch/arm64/kvm/mmu.c
 create mode 100644 arch/arm64/kvm/perf.c
 create mode 100644 arch/arm64/kvm/pmu-emul.c
 create mode 100644 arch/arm64/kvm/psci.c
 create mode 100644 arch/arm64/kvm/pvtime.c
 create mode 100644 arch/arm64/kvm/trace_arm.h
 create mode 100644 arch/arm64/kvm/trace_handle_exit.h
 create mode 100644 arch/arm64/kvm/vgic/trace.h
 create mode 100644 arch/arm64/kvm/vgic/vgic-debug.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-init.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-irqfd.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-its.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-kvm-device.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-mmio-v2.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-mmio-v3.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-mmio.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-mmio.h
 create mode 100644 arch/arm64/kvm/vgic/vgic-v2.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-v3.c
 create mode 100644 arch/arm64/kvm/vgic/vgic-v4.c
 create mode 100644 arch/arm64/kvm/vgic/vgic.c
 create mode 100644 arch/arm64/kvm/vgic/vgic.h
 delete mode 100644 virt/kvm/arm/aarch32.c
 delete mode 100644 virt/kvm/arm/arch_timer.c
 delete mode 100644 virt/kvm/arm/arm.c
 delete mode 100644 virt/kvm/arm/hyp/aarch32.c
 delete mode 100644 virt/kvm/arm/hyp/timer-sr.c
 delete mode 100644 virt/kvm/arm/hyp/vgic-v3-sr.c
 delete mode 100644 virt/kvm/arm/hypercalls.c
 delete mode 100644 virt/kvm/arm/mmio.c
 delete mode 100644 virt/kvm/arm/mmu.c
 delete mode 100644 virt/kvm/arm/perf.c
 delete mode 100644 virt/kvm/arm/pmu.c
 delete mode 100644 virt/kvm/arm/psci.c
 delete mode 100644 virt/kvm/arm/pvtime.c
 delete mode 100644 virt/kvm/arm/trace.h
 delete mode 100644 virt/kvm/arm/vgic/trace.h
 delete mode 100644 virt/kvm/arm/vgic/vgic-debug.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-init.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-irqfd.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-its.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-kvm-device.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-mmio-v2.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-mmio-v3.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-mmio.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-mmio.h
 delete mode 100644 virt/kvm/arm/vgic/vgic-v2.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-v3.c
 delete mode 100644 virt/kvm/arm/vgic/vgic-v4.c
 delete mode 100644 virt/kvm/arm/vgic/vgic.c
 delete mode 100644 virt/kvm/arm/vgic/vgic.h

(limited to 'arch')

diff --git a/MAINTAINERS b/MAINTAINERS
index 091ec22c1a23..6c5b928989ed 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9295,7 +9295,6 @@ F:	arch/arm64/include/asm/kvm*
 F:	arch/arm64/include/uapi/asm/kvm*
 F:	arch/arm64/kvm/
 F:	include/kvm/arm_*
-F:	virt/kvm/arm/
 
 KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
 L:	linux-mips@vger.kernel.org
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 5ffbdc39e780..7a3768538343 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -3,37 +3,37 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic
+ccflags-y += -I $(srctree)/$(src)
 
 KVM=../../../virt/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_ARM_HOST) += arm.o mmu.o mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += psci.o perf.o
+kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o
+kvm-$(CONFIG_KVM_ARM_HOST) += pvtime.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
 kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
+kvm-$(CONFIG_KVM_ARM_HOST) += aarch32.o
+kvm-$(CONFIG_KVM_ARM_HOST) += arch_timer.o
+kvm-$(CONFIG_KVM_ARM_PMU)  += pmu-emul.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
-kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-init.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-irqfd.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v4.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v2.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v3.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-kvm-device.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-its.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-debug.o
diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c
new file mode 100644
index 000000000000..0a356aa91aa1
--- /dev/null
+++ b/arch/arm64/kvm/aarch32.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#define DFSR_FSC_EXTABT_LPAE	0x10
+#define DFSR_FSC_EXTABT_nLPAE	0x08
+#define DFSR_LPAE		BIT(9)
+
+/*
+ * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
+ */
+static const u8 return_offsets[8][2] = {
+	[0] = { 0, 0 },		/* Reset, unused */
+	[1] = { 4, 2 },		/* Undefined */
+	[2] = { 0, 0 },		/* SVC, unused */
+	[3] = { 4, 4 },		/* Prefetch abort */
+	[4] = { 8, 8 },		/* Data abort */
+	[5] = { 0, 0 },		/* HVC, unused */
+	[6] = { 4, 4 },		/* IRQ, unused */
+	[7] = { 4, 4 },		/* FIQ, unused */
+};
+
+/*
+ * When an exception is taken, most CPSR fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]).
+ *
+ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
+ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
+ * obsoleted by the ARMv7 virtualization extensions and is RES0.
+ *
+ * For the SPSR layout seen from AArch32, see:
+ * - ARM DDI 0406C.d, page B1-1148
+ * - ARM DDI 0487E.a, page G8-6264
+ *
+ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
+ * - ARM DDI 0487E.a, page C5-426
+ *
+ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+	unsigned long old, new;
+
+	old = *vcpu_cpsr(vcpu);
+	new = 0;
+
+	new |= (old & PSR_AA32_N_BIT);
+	new |= (old & PSR_AA32_Z_BIT);
+	new |= (old & PSR_AA32_C_BIT);
+	new |= (old & PSR_AA32_V_BIT);
+	new |= (old & PSR_AA32_Q_BIT);
+
+	// CPSR.IT[7:0] are set to zero upon any exception
+	// See ARM DDI 0487E.a, section G1.12.3
+	// See ARM DDI 0406C.d, section B1.8.3
+
+	new |= (old & PSR_AA32_DIT_BIT);
+
+	// CPSR.SSBS is set to SCTLR.DSSBS upon any exception
+	// See ARM DDI 0487E.a, page G8-6244
+	if (sctlr & BIT(31))
+		new |= PSR_AA32_SSBS_BIT;
+
+	// CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
+	// SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
+	// See ARM DDI 0487E.a, page G8-6246
+	new |= (old & PSR_AA32_PAN_BIT);
+	if (!(sctlr & BIT(23)))
+		new |= PSR_AA32_PAN_BIT;
+
+	// SS does not exist in AArch32, so ignore
+
+	// CPSR.IL is set to zero upon any exception
+	// See ARM DDI 0487E.a, page G1-5527
+
+	new |= (old & PSR_AA32_GE_MASK);
+
+	// CPSR.IT[7:0] are set to zero upon any exception
+	// See prior comment above
+
+	// CPSR.E is set to SCTLR.EE upon any exception
+	// See ARM DDI 0487E.a, page G8-6245
+	// See ARM DDI 0406C.d, page B4-1701
+	if (sctlr & BIT(25))
+		new |= PSR_AA32_E_BIT;
+
+	// CPSR.A is unchanged upon an exception to Undefined, Supervisor
+	// CPSR.A is set upon an exception to other modes
+	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+	// See ARM DDI 0406C.d, page B1-1182
+	new |= (old & PSR_AA32_A_BIT);
+	if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
+		new |= PSR_AA32_A_BIT;
+
+	// CPSR.I is set upon any exception
+	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+	// See ARM DDI 0406C.d, page B1-1182
+	new |= PSR_AA32_I_BIT;
+
+	// CPSR.F is set upon an exception to FIQ
+	// CPSR.F is unchanged upon an exception to other modes
+	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+	// See ARM DDI 0406C.d, page B1-1182
+	new |= (old & PSR_AA32_F_BIT);
+	if (mode == PSR_AA32_MODE_FIQ)
+		new |= PSR_AA32_F_BIT;
+
+	// CPSR.T is set to SCTLR.TE upon any exception
+	// See ARM DDI 0487E.a, page G8-5514
+	// See ARM DDI 0406C.d, page B1-1181
+	if (sctlr & BIT(30))
+		new |= PSR_AA32_T_BIT;
+
+	new |= mode;
+
+	return new;
+}
+
+static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
+{
+	unsigned long spsr = *vcpu_cpsr(vcpu);
+	bool is_thumb = (spsr & PSR_AA32_T_BIT);
+	u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
+	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+
+	*vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
+
+	/* Note: These now point to the banked copies */
+	vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
+	*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
+
+	/* Branch to exception vector */
+	if (sctlr & (1 << 13))
+		vect_offset += 0xffff0000;
+	else /* always have security exceptions */
+		vect_offset += vcpu_cp15(vcpu, c12_VBAR);
+
+	*vcpu_pc(vcpu) = vect_offset;
+}
+
+void kvm_inject_undef32(struct kvm_vcpu *vcpu)
+{
+	prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
+}
+
+/*
+ * Modelled after TakeDataAbortException() and TakePrefetchAbortException
+ * pseudocode.
+ */
+static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
+			 unsigned long addr)
+{
+	u32 vect_offset;
+	u32 *far, *fsr;
+	bool is_lpae;
+
+	if (is_pabt) {
+		vect_offset = 12;
+		far = &vcpu_cp15(vcpu, c6_IFAR);
+		fsr = &vcpu_cp15(vcpu, c5_IFSR);
+	} else { /* !iabt */
+		vect_offset = 16;
+		far = &vcpu_cp15(vcpu, c6_DFAR);
+		fsr = &vcpu_cp15(vcpu, c5_DFSR);
+	}
+
+	prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
+
+	*far = addr;
+
+	/* Give the guest an IMPLEMENTATION DEFINED exception */
+	is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
+	if (is_lpae) {
+		*fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
+	} else {
+		/* no need to shuffle FS[4] into DFSR[10] as its 0 */
+		*fsr = DFSR_FSC_EXTABT_nLPAE;
+	}
+}
+
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+	inject_abt32(vcpu, false, addr);
+}
+
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+	inject_abt32(vcpu, true, addr);
+}
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
new file mode 100644
index 000000000000..93bd59b46848
--- /dev/null
+++ b/arch/arm64/kvm/arch_timer.c
@@ -0,0 +1,1180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/uaccess.h>
+
+#include <clocksource/arm_arch_timer.h>
+#include <asm/arch_timer.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#include <kvm/arm_vgic.h>
+#include <kvm/arm_arch_timer.h>
+
+#include "trace.h"
+
+static struct timecounter *timecounter;
+static unsigned int host_vtimer_irq;
+static unsigned int host_ptimer_irq;
+static u32 host_vtimer_irq_flags;
+static u32 host_ptimer_irq_flags;
+
+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
+static const struct kvm_irq_level default_ptimer_irq = {
+	.irq	= 30,
+	.level	= 1,
+};
+
+static const struct kvm_irq_level default_vtimer_irq = {
+	.irq	= 27,
+	.level	= 1,
+};
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+				 struct arch_timer_context *timer_ctx);
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+				struct arch_timer_context *timer,
+				enum kvm_arch_timer_regs treg,
+				u64 val);
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+			      struct arch_timer_context *timer,
+			      enum kvm_arch_timer_regs treg);
+
+u64 kvm_phys_timer_read(void)
+{
+	return timecounter->cc->read(timecounter->cc);
+}
+
+static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+	if (has_vhe()) {
+		map->direct_vtimer = vcpu_vtimer(vcpu);
+		map->direct_ptimer = vcpu_ptimer(vcpu);
+		map->emul_ptimer = NULL;
+	} else {
+		map->direct_vtimer = vcpu_vtimer(vcpu);
+		map->direct_ptimer = NULL;
+		map->emul_ptimer = vcpu_ptimer(vcpu);
+	}
+
+	trace_kvm_get_timer_map(vcpu->vcpu_id, map);
+}
+
+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+	return static_branch_unlikely(&userspace_irqchip_in_use) &&
+		unlikely(!irqchip_in_kernel(kvm));
+}
+
+static void soft_timer_start(struct hrtimer *hrt, u64 ns)
+{
+	hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
+		      HRTIMER_MODE_ABS_HARD);
+}
+
+static void soft_timer_cancel(struct hrtimer *hrt)
+{
+	hrtimer_cancel(hrt);
+}
+
+static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
+{
+	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
+	struct arch_timer_context *ctx;
+	struct timer_map map;
+
+	/*
+	 * We may see a timer interrupt after vcpu_put() has been called which
+	 * sets the CPU's vcpu pointer to NULL, because even though the timer
+	 * has been disabled in timer_save_state(), the hardware interrupt
+	 * signal may not have been retired from the interrupt controller yet.
+	 */
+	if (!vcpu)
+		return IRQ_HANDLED;
+
+	get_timer_map(vcpu, &map);
+
+	if (irq == host_vtimer_irq)
+		ctx = map.direct_vtimer;
+	else
+		ctx = map.direct_ptimer;
+
+	if (kvm_timer_should_fire(ctx))
+		kvm_timer_update_irq(vcpu, true, ctx);
+
+	if (userspace_irqchip(vcpu->kvm) &&
+	    !static_branch_unlikely(&has_gic_active_state))
+		disable_percpu_irq(host_vtimer_irq);
+
+	return IRQ_HANDLED;
+}
+
+static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
+{
+	u64 cval, now;
+
+	cval = timer_ctx->cnt_cval;
+	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+	if (now < cval) {
+		u64 ns;
+
+		ns = cyclecounter_cyc2ns(timecounter->cc,
+					 cval - now,
+					 timecounter->mask,
+					 &timecounter->frac);
+		return ns;
+	}
+
+	return 0;
+}
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
+{
+	WARN_ON(timer_ctx && timer_ctx->loaded);
+	return timer_ctx &&
+	       !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+		(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
+}
+
+/*
+ * Returns the earliest expiration time in ns among guest timers.
+ * Note that it will return 0 if none of timers can fire.
+ */
+static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
+{
+	u64 min_delta = ULLONG_MAX;
+	int i;
+
+	for (i = 0; i < NR_KVM_TIMERS; i++) {
+		struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
+
+		WARN(ctx->loaded, "timer %d loaded\n", i);
+		if (kvm_timer_irq_can_fire(ctx))
+			min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
+	}
+
+	/* If none of timers can fire, then return 0 */
+	if (min_delta == ULLONG_MAX)
+		return 0;
+
+	return min_delta;
+}
+
+static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
+{
+	struct arch_timer_cpu *timer;
+	struct kvm_vcpu *vcpu;
+	u64 ns;
+
+	timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
+	vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
+
+	/*
+	 * Check that the timer has really expired from the guest's
+	 * PoV (NTP on the host may have forced it to expire
+	 * early). If we should have slept longer, restart it.
+	 */
+	ns = kvm_timer_earliest_exp(vcpu);
+	if (unlikely(ns)) {
+		hrtimer_forward_now(hrt, ns_to_ktime(ns));
+		return HRTIMER_RESTART;
+	}
+
+	kvm_vcpu_wake_up(vcpu);
+	return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
+{
+	struct arch_timer_context *ctx;
+	struct kvm_vcpu *vcpu;
+	u64 ns;
+
+	ctx = container_of(hrt, struct arch_timer_context, hrtimer);
+	vcpu = ctx->vcpu;
+
+	trace_kvm_timer_hrtimer_expire(ctx);
+
+	/*
+	 * Check that the timer has really expired from the guest's
+	 * PoV (NTP on the host may have forced it to expire
+	 * early). If not ready, schedule for a later time.
+	 */
+	ns = kvm_timer_compute_delta(ctx);
+	if (unlikely(ns)) {
+		hrtimer_forward_now(hrt, ns_to_ktime(ns));
+		return HRTIMER_RESTART;
+	}
+
+	kvm_timer_update_irq(vcpu, true, ctx);
+	return HRTIMER_NORESTART;
+}
+
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
+{
+	enum kvm_arch_timers index;
+	u64 cval, now;
+
+	if (!timer_ctx)
+		return false;
+
+	index = arch_timer_ctx_index(timer_ctx);
+
+	if (timer_ctx->loaded) {
+		u32 cnt_ctl = 0;
+
+		switch (index) {
+		case TIMER_VTIMER:
+			cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+			break;
+		case TIMER_PTIMER:
+			cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+			break;
+		case NR_KVM_TIMERS:
+			/* GCC is braindead */
+			cnt_ctl = 0;
+			break;
+		}
+
+		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
+		        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
+		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
+	}
+
+	if (!kvm_timer_irq_can_fire(timer_ctx))
+		return false;
+
+	cval = timer_ctx->cnt_cval;
+	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+	return cval <= now;
+}
+
+bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
+{
+	struct timer_map map;
+
+	get_timer_map(vcpu, &map);
+
+	return kvm_timer_should_fire(map.direct_vtimer) ||
+	       kvm_timer_should_fire(map.direct_ptimer) ||
+	       kvm_timer_should_fire(map.emul_ptimer);
+}
+
+/*
+ * Reflect the timer output level into the kvm_run structure
+ */
+void kvm_timer_update_run(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+	/* Populate the device bitmap with the timer states */
+	regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
+				    KVM_ARM_DEV_EL1_PTIMER);
+	if (kvm_timer_should_fire(vtimer))
+		regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
+	if (kvm_timer_should_fire(ptimer))
+		regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
+}
+
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+				 struct arch_timer_context *timer_ctx)
+{
+	int ret;
+
+	timer_ctx->irq.level = new_level;
+	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+				   timer_ctx->irq.level);
+
+	if (!userspace_irqchip(vcpu->kvm)) {
+		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+					  timer_ctx->irq.irq,
+					  timer_ctx->irq.level,
+					  timer_ctx);
+		WARN_ON(ret);
+	}
+}
+
+/* Only called for a fully emulated timer */
+static void timer_emulate(struct arch_timer_context *ctx)
+{
+	bool should_fire = kvm_timer_should_fire(ctx);
+
+	trace_kvm_timer_emulate(ctx, should_fire);
+
+	if (should_fire != ctx->irq.level) {
+		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
+		return;
+	}
+
+	/*
+	 * If the timer can fire now, we don't need to have a soft timer
+	 * scheduled for the future.  If the timer cannot fire at all,
+	 * then we also don't need a soft timer.
+	 */
+	if (!kvm_timer_irq_can_fire(ctx)) {
+		soft_timer_cancel(&ctx->hrtimer);
+		return;
+	}
+
+	soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
+}
+
+static void timer_save_state(struct arch_timer_context *ctx)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+	unsigned long flags;
+
+	if (!timer->enabled)
+		return;
+
+	local_irq_save(flags);
+
+	if (!ctx->loaded)
+		goto out;
+
+	switch (index) {
+	case TIMER_VTIMER:
+		ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+		ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL);
+
+		/* Disable the timer */
+		write_sysreg_el0(0, SYS_CNTV_CTL);
+		isb();
+
+		break;
+	case TIMER_PTIMER:
+		ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+		ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL);
+
+		/* Disable the timer */
+		write_sysreg_el0(0, SYS_CNTP_CTL);
+		isb();
+
+		break;
+	case NR_KVM_TIMERS:
+		BUG();
+	}
+
+	trace_kvm_timer_save_state(ctx);
+
+	ctx->loaded = false;
+out:
+	local_irq_restore(flags);
+}
+
+/*
+ * Schedule the background timer before calling kvm_vcpu_block, so that this
+ * thread is removed from its waitqueue and made runnable when there's a timer
+ * interrupt to handle.
+ */
+static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct timer_map map;
+
+	get_timer_map(vcpu, &map);
+
+	/*
+	 * If no timers are capable of raising interrupts (disabled or
+	 * masked), then there's no more work for us to do.
+	 */
+	if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
+	    !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+	    !kvm_timer_irq_can_fire(map.emul_ptimer))
+		return;
+
+	/*
+	 * At least one guest time will expire. Schedule a background timer.
+	 * Set the earliest expiration time among the guest timers.
+	 */
+	soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
+}
+
+static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+	soft_timer_cancel(&timer->bg_timer);
+}
+
+static void timer_restore_state(struct arch_timer_context *ctx)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+	unsigned long flags;
+
+	if (!timer->enabled)
+		return;
+
+	local_irq_save(flags);
+
+	if (ctx->loaded)
+		goto out;
+
+	switch (index) {
+	case TIMER_VTIMER:
+		write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL);
+		isb();
+		write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL);
+		break;
+	case TIMER_PTIMER:
+		write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL);
+		isb();
+		write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL);
+		break;
+	case NR_KVM_TIMERS:
+		BUG();
+	}
+
+	trace_kvm_timer_restore_state(ctx);
+
+	ctx->loaded = true;
+out:
+	local_irq_restore(flags);
+}
+
+static void set_cntvoff(u64 cntvoff)
+{
+	u32 low = lower_32_bits(cntvoff);
+	u32 high = upper_32_bits(cntvoff);
+
+	/*
+	 * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
+	 * 32-bit systems, but rather passes register by register shifted one
+	 * place (we put the function address in r0/x0), we cannot simply pass
+	 * a 64-bit value as an argument, but have to split the value in two
+	 * 32-bit halves.
+	 */
+	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
+}
+
+static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
+{
+	int r;
+	r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
+	WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
+{
+	struct kvm_vcpu *vcpu = ctx->vcpu;
+	bool phys_active = false;
+
+	/*
+	 * Update the timer output so that it is likely to match the
+	 * state we're about to restore. If the timer expires between
+	 * this point and the register restoration, we'll take the
+	 * interrupt anyway.
+	 */
+	kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
+
+	if (irqchip_in_kernel(vcpu->kvm))
+		phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+
+	phys_active |= ctx->irq.level;
+
+	set_timer_irq_phys_active(ctx, phys_active);
+}
+
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+	/*
+	 * Update the timer output so that it is likely to match the
+	 * state we're about to restore. If the timer expires between
+	 * this point and the register restoration, we'll take the
+	 * interrupt anyway.
+	 */
+	kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+	/*
+	 * When using a userspace irqchip with the architected timers and a
+	 * host interrupt controller that doesn't support an active state, we
+	 * must still prevent continuously exiting from the guest, and
+	 * therefore mask the physical interrupt by disabling it on the host
+	 * interrupt controller when the virtual level is high, such that the
+	 * guest can make forward progress.  Once we detect the output level
+	 * being de-asserted, we unmask the interrupt again so that we exit
+	 * from the guest when the timer fires.
+	 */
+	if (vtimer->irq.level)
+		disable_percpu_irq(host_vtimer_irq);
+	else
+		enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+}
+
+void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct timer_map map;
+
+	if (unlikely(!timer->enabled))
+		return;
+
+	get_timer_map(vcpu, &map);
+
+	if (static_branch_likely(&has_gic_active_state)) {
+		kvm_timer_vcpu_load_gic(map.direct_vtimer);
+		if (map.direct_ptimer)
+			kvm_timer_vcpu_load_gic(map.direct_ptimer);
+	} else {
+		kvm_timer_vcpu_load_nogic(vcpu);
+	}
+
+	set_cntvoff(map.direct_vtimer->cntvoff);
+
+	kvm_timer_unblocking(vcpu);
+
+	timer_restore_state(map.direct_vtimer);
+	if (map.direct_ptimer)
+		timer_restore_state(map.direct_ptimer);
+
+	if (map.emul_ptimer)
+		timer_emulate(map.emul_ptimer);
+}
+
+bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+	bool vlevel, plevel;
+
+	if (likely(irqchip_in_kernel(vcpu->kvm)))
+		return false;
+
+	vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
+	plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
+
+	return kvm_timer_should_fire(vtimer) != vlevel ||
+	       kvm_timer_should_fire(ptimer) != plevel;
+}
+
+void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct timer_map map;
+
+	if (unlikely(!timer->enabled))
+		return;
+
+	get_timer_map(vcpu, &map);
+
+	timer_save_state(map.direct_vtimer);
+	if (map.direct_ptimer)
+		timer_save_state(map.direct_ptimer);
+
+	/*
+	 * Cancel soft timer emulation, because the only case where we
+	 * need it after a vcpu_put is in the context of a sleeping VCPU, and
+	 * in that case we already factor in the deadline for the physical
+	 * timer when scheduling the bg_timer.
+	 *
+	 * In any case, we re-schedule the hrtimer for the physical timer when
+	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
+	 */
+	if (map.emul_ptimer)
+		soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+		kvm_timer_blocking(vcpu);
+
+	/*
+	 * The kernel may decide to run userspace after calling vcpu_put, so
+	 * we reset cntvoff to 0 to ensure a consistent read between user
+	 * accesses to the virtual counter and kernel access to the physical
+	 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
+	 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
+	 */
+	set_cntvoff(0);
+}
+
+/*
+ * With a userspace irqchip we have to check if the guest de-asserted the
+ * timer and if so, unmask the timer irq signal on the host interrupt
+ * controller to ensure that we see future timer signals.
+ */
+static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+	if (!kvm_timer_should_fire(vtimer)) {
+		kvm_timer_update_irq(vcpu, false, vtimer);
+		if (static_branch_likely(&has_gic_active_state))
+			set_timer_irq_phys_active(vtimer, false);
+		else
+			enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+	}
+}
+
+void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+	if (unlikely(!timer->enabled))
+		return;
+
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		unmask_vtimer_irq_user(vcpu);
+}
+
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct timer_map map;
+
+	get_timer_map(vcpu, &map);
+
+	/*
+	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
+	 * and to 0 for ARMv7.  We provide an implementation that always
+	 * resets the timer to be disabled and unmasked and is compliant with
+	 * the ARMv7 architecture.
+	 */
+	vcpu_vtimer(vcpu)->cnt_ctl = 0;
+	vcpu_ptimer(vcpu)->cnt_ctl = 0;
+
+	if (timer->enabled) {
+		kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
+		kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+
+		if (irqchip_in_kernel(vcpu->kvm)) {
+			kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+			if (map.direct_ptimer)
+				kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+		}
+	}
+
+	if (map.emul_ptimer)
+		soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+	return 0;
+}
+
+/* Make the updates of cntvoff for all vtimer contexts atomic */
+static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
+{
+	int i;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *tmp;
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(i, tmp, kvm)
+		vcpu_vtimer(tmp)->cntvoff = cntvoff;
+
+	/*
+	 * When called from the vcpu create path, the CPU being created is not
+	 * included in the loop above, so we just set it here as well.
+	 */
+	vcpu_vtimer(vcpu)->cntvoff = cntvoff;
+	mutex_unlock(&kvm->lock);
+}
+
+void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+
+	/* Synchronize cntvoff across all vtimers of a VM. */
+	update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
+	ptimer->cntvoff = 0;
+
+	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+	timer->bg_timer.function = kvm_bg_timer_expire;
+
+	hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+	hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+	vtimer->hrtimer.function = kvm_hrtimer_expire;
+	ptimer->hrtimer.function = kvm_hrtimer_expire;
+
+	vtimer->irq.irq = default_vtimer_irq.irq;
+	ptimer->irq.irq = default_ptimer_irq.irq;
+
+	vtimer->host_timer_irq = host_vtimer_irq;
+	ptimer->host_timer_irq = host_ptimer_irq;
+
+	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
+	ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+
+	vtimer->vcpu = vcpu;
+	ptimer->vcpu = vcpu;
+}
+
+static void kvm_timer_init_interrupt(void *info)
+{
+	enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+	enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+}
+
+int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
+{
+	struct arch_timer_context *timer;
+
+	switch (regid) {
+	case KVM_REG_ARM_TIMER_CTL:
+		timer = vcpu_vtimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+		break;
+	case KVM_REG_ARM_TIMER_CNT:
+		timer = vcpu_vtimer(vcpu);
+		update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
+		break;
+	case KVM_REG_ARM_TIMER_CVAL:
+		timer = vcpu_vtimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+		break;
+	case KVM_REG_ARM_PTIMER_CTL:
+		timer = vcpu_ptimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+		break;
+	case KVM_REG_ARM_PTIMER_CVAL:
+		timer = vcpu_ptimer(vcpu);
+		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+		break;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static u64 read_timer_ctl(struct arch_timer_context *timer)
+{
+	/*
+	 * Set ISTATUS bit if it's expired.
+	 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
+	 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
+	 * regardless of ENABLE bit for our implementation convenience.
+	 */
+	if (!kvm_timer_compute_delta(timer))
+		return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
+	else
+		return timer->cnt_ctl;
+}
+
+u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
+{
+	switch (regid) {
+	case KVM_REG_ARM_TIMER_CTL:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CTL);
+	case KVM_REG_ARM_TIMER_CNT:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CNT);
+	case KVM_REG_ARM_TIMER_CVAL:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_CVAL);
+	case KVM_REG_ARM_PTIMER_CTL:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_ptimer(vcpu), TIMER_REG_CTL);
+	case KVM_REG_ARM_PTIMER_CNT:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_ptimer(vcpu), TIMER_REG_CNT);
+	case KVM_REG_ARM_PTIMER_CVAL:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_ptimer(vcpu), TIMER_REG_CVAL);
+	}
+	return (u64)-1;
+}
+
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+			      struct arch_timer_context *timer,
+			      enum kvm_arch_timer_regs treg)
+{
+	u64 val;
+
+	switch (treg) {
+	case TIMER_REG_TVAL:
+		val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
+		val &= lower_32_bits(val);
+		break;
+
+	case TIMER_REG_CTL:
+		val = read_timer_ctl(timer);
+		break;
+
+	case TIMER_REG_CVAL:
+		val = timer->cnt_cval;
+		break;
+
+	case TIMER_REG_CNT:
+		val = kvm_phys_timer_read() - timer->cntvoff;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return val;
+}
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+			      enum kvm_arch_timers tmr,
+			      enum kvm_arch_timer_regs treg)
+{
+	u64 val;
+
+	preempt_disable();
+	kvm_timer_vcpu_put(vcpu);
+
+	val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+
+	kvm_timer_vcpu_load(vcpu);
+	preempt_enable();
+
+	return val;
+}
+
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+				struct arch_timer_context *timer,
+				enum kvm_arch_timer_regs treg,
+				u64 val)
+{
+	switch (treg) {
+	case TIMER_REG_TVAL:
+		timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
+		break;
+
+	case TIMER_REG_CTL:
+		timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
+		break;
+
+	case TIMER_REG_CVAL:
+		timer->cnt_cval = val;
+		break;
+
+	default:
+		BUG();
+	}
+}
+
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+				enum kvm_arch_timers tmr,
+				enum kvm_arch_timer_regs treg,
+				u64 val)
+{
+	preempt_disable();
+	kvm_timer_vcpu_put(vcpu);
+
+	kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
+
+	kvm_timer_vcpu_load(vcpu);
+	preempt_enable();
+}
+
+static int kvm_timer_starting_cpu(unsigned int cpu)
+{
+	kvm_timer_init_interrupt(NULL);
+	return 0;
+}
+
+static int kvm_timer_dying_cpu(unsigned int cpu)
+{
+	disable_percpu_irq(host_vtimer_irq);
+	return 0;
+}
+
+int kvm_timer_hyp_init(bool has_gic)
+{
+	struct arch_timer_kvm_info *info;
+	int err;
+
+	info = arch_timer_get_kvm_info();
+	timecounter = &info->timecounter;
+
+	if (!timecounter->cc) {
+		kvm_err("kvm_arch_timer: uninitialized timecounter\n");
+		return -ENODEV;
+	}
+
+	/* First, do the virtual EL1 timer irq */
+
+	if (info->virtual_irq <= 0) {
+		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
+			info->virtual_irq);
+		return -ENODEV;
+	}
+	host_vtimer_irq = info->virtual_irq;
+
+	host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
+	if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
+	    host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
+		kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
+			host_vtimer_irq);
+		host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
+	}
+
+	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
+				 "kvm guest vtimer", kvm_get_running_vcpus());
+	if (err) {
+		kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
+			host_vtimer_irq, err);
+		return err;
+	}
+
+	if (has_gic) {
+		err = irq_set_vcpu_affinity(host_vtimer_irq,
+					    kvm_get_running_vcpus());
+		if (err) {
+			kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+			goto out_free_irq;
+		}
+
+		static_branch_enable(&has_gic_active_state);
+	}
+
+	kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
+
+	/* Now let's do the physical EL1 timer irq */
+
+	if (info->physical_irq > 0) {
+		host_ptimer_irq = info->physical_irq;
+		host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
+		if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
+		    host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
+			kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
+				host_ptimer_irq);
+			host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
+		}
+
+		err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
+					 "kvm guest ptimer", kvm_get_running_vcpus());
+		if (err) {
+			kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
+				host_ptimer_irq, err);
+			return err;
+		}
+
+		if (has_gic) {
+			err = irq_set_vcpu_affinity(host_ptimer_irq,
+						    kvm_get_running_vcpus());
+			if (err) {
+				kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+				goto out_free_irq;
+			}
+		}
+
+		kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
+	} else if (has_vhe()) {
+		kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
+			info->physical_irq);
+		err = -ENODEV;
+		goto out_free_irq;
+	}
+
+	cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
+			  "kvm/arm/timer:starting", kvm_timer_starting_cpu,
+			  kvm_timer_dying_cpu);
+	return 0;
+out_free_irq:
+	free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
+	return err;
+}
+
+void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+	soft_timer_cancel(&timer->bg_timer);
+}
+
+static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
+{
+	int vtimer_irq, ptimer_irq;
+	int i, ret;
+
+	vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
+	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
+	if (ret)
+		return false;
+
+	ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
+	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
+	if (ret)
+		return false;
+
+	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
+		if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
+		    vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
+			return false;
+	}
+
+	return true;
+}
+
+bool kvm_arch_timer_get_input_level(int vintid)
+{
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+	struct arch_timer_context *timer;
+
+	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+		timer = vcpu_vtimer(vcpu);
+	else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+		timer = vcpu_ptimer(vcpu);
+	else
+		BUG();
+
+	return kvm_timer_should_fire(timer);
+}
+
+int kvm_timer_enable(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+	struct timer_map map;
+	int ret;
+
+	if (timer->enabled)
+		return 0;
+
+	/* Without a VGIC we do not map virtual IRQs to physical IRQs */
+	if (!irqchip_in_kernel(vcpu->kvm))
+		goto no_vgic;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -ENODEV;
+
+	if (!timer_irqs_are_valid(vcpu)) {
+		kvm_debug("incorrectly configured timer irqs\n");
+		return -EINVAL;
+	}
+
+	get_timer_map(vcpu, &map);
+
+	ret = kvm_vgic_map_phys_irq(vcpu,
+				    map.direct_vtimer->host_timer_irq,
+				    map.direct_vtimer->irq.irq,
+				    kvm_arch_timer_get_input_level);
+	if (ret)
+		return ret;
+
+	if (map.direct_ptimer) {
+		ret = kvm_vgic_map_phys_irq(vcpu,
+					    map.direct_ptimer->host_timer_irq,
+					    map.direct_ptimer->irq.irq,
+					    kvm_arch_timer_get_input_level);
+	}
+
+	if (ret)
+		return ret;
+
+no_vgic:
+	timer->enabled = 1;
+	return 0;
+}
+
+/*
+ * On VHE system, we only need to configure the EL2 timer trap register once,
+ * not for every world switch.
+ * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
+ * and this makes those bits have no effect for the host kernel execution.
+ */
+void kvm_timer_init_vhe(void)
+{
+	/* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */
+	u32 cnthctl_shift = 10;
+	u64 val;
+
+	/*
+	 * VHE systems allow the guest direct access to the EL1 physical
+	 * timer/counter.
+	 */
+	val = read_sysreg(cnthctl_el2);
+	val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
+	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
+	write_sysreg(val, cnthctl_el2);
+}
+
+static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
+		vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
+	}
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	int __user *uaddr = (int __user *)(long)attr->addr;
+	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	int irq;
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return -EINVAL;
+
+	if (get_user(irq, uaddr))
+		return -EFAULT;
+
+	if (!(irq_is_ppi(irq)))
+		return -EINVAL;
+
+	if (vcpu->arch.timer_cpu.enabled)
+		return -EBUSY;
+
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+		set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	int __user *uaddr = (int __user *)(long)attr->addr;
+	struct arch_timer_context *timer;
+	int irq;
+
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+		timer = vcpu_vtimer(vcpu);
+		break;
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		timer = vcpu_ptimer(vcpu);
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	irq = timer->irq.irq;
+	return put_user(irq, uaddr);
+}
+
+int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+		return 0;
+	}
+
+	return -ENXIO;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
new file mode 100644
index 000000000000..c958bb37b769
--- /dev/null
+++ b/arch/arm64/kvm/arm.c
@@ -0,0 +1,1681 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bug.h>
+#include <linux/cpu_pm.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <trace/events/kvm.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_arm.h"
+
+#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/mman.h>
+#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/virt.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_coproc.h>
+#include <asm/sections.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_psci.h>
+
+#ifdef REQUIRES_VIRT
+__asm__(".arch_extension	virt");
+#endif
+
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+
+/* The VMID used in the VTTBR */
+static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
+static u32 kvm_next_vmid;
+static DEFINE_SPINLOCK(kvm_vmid_lock);
+
+static bool vgic_present;
+
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_hardware_setup(void *opaque)
+{
+	return 0;
+}
+
+int kvm_arch_check_processor_compat(void *opaque)
+{
+	return 0;
+}
+
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+			    struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_ARM_NISV_TO_USER:
+		r = 0;
+		kvm->arch.return_nisv_io_abort_to_user = true;
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+/**
+ * kvm_arch_init_vm - initializes a VM data structure
+ * @kvm:	pointer to the KVM struct
+ */
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+	int ret, cpu;
+
+	ret = kvm_arm_setup_stage2(kvm, type);
+	if (ret)
+		return ret;
+
+	kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
+	if (!kvm->arch.last_vcpu_ran)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
+
+	ret = kvm_alloc_stage2_pgd(kvm);
+	if (ret)
+		goto out_fail_alloc;
+
+	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
+	if (ret)
+		goto out_free_stage2_pgd;
+
+	kvm_vgic_early_init(kvm);
+
+	/* Mark the initial VMID generation invalid */
+	kvm->arch.vmid.vmid_gen = 0;
+
+	/* The maximum number of VCPUs is limited by the host's GIC model */
+	kvm->arch.max_vcpus = vgic_present ?
+				kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
+
+	return ret;
+out_free_stage2_pgd:
+	kvm_free_stage2_pgd(kvm);
+out_fail_alloc:
+	free_percpu(kvm->arch.last_vcpu_ran);
+	kvm->arch.last_vcpu_ran = NULL;
+	return ret;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+
+/**
+ * kvm_arch_destroy_vm - destroy the VM data structure
+ * @kvm:	pointer to the KVM struct
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+	int i;
+
+	kvm_vgic_destroy(kvm);
+
+	free_percpu(kvm->arch.last_vcpu_ran);
+	kvm->arch.last_vcpu_ran = NULL;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm_vcpu_destroy(kvm->vcpus[i]);
+			kvm->vcpus[i] = NULL;
+		}
+	}
+	atomic_set(&kvm->online_vcpus, 0);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+	int r;
+	switch (ext) {
+	case KVM_CAP_IRQCHIP:
+		r = vgic_present;
+		break;
+	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
+	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_SYNC_MMU:
+	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+	case KVM_CAP_ONE_REG:
+	case KVM_CAP_ARM_PSCI:
+	case KVM_CAP_ARM_PSCI_0_2:
+	case KVM_CAP_READONLY_MEM:
+	case KVM_CAP_MP_STATE:
+	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_VCPU_EVENTS:
+	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
+	case KVM_CAP_ARM_NISV_TO_USER:
+	case KVM_CAP_ARM_INJECT_EXT_DABT:
+		r = 1;
+		break;
+	case KVM_CAP_ARM_SET_DEVICE_ADDR:
+		r = 1;
+		break;
+	case KVM_CAP_NR_VCPUS:
+		r = num_online_cpus();
+		break;
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_ID;
+		break;
+	case KVM_CAP_MSI_DEVID:
+		if (!kvm)
+			r = -EINVAL;
+		else
+			r = kvm->arch.vgic.msis_require_devid;
+		break;
+	case KVM_CAP_ARM_USER_IRQ:
+		/*
+		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
+		 * (bump this number if adding more devices)
+		 */
+		r = 1;
+		break;
+	default:
+		r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
+		break;
+	}
+	return r;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg)
+{
+	return -EINVAL;
+}
+
+struct kvm *kvm_arch_alloc_vm(void)
+{
+	if (!has_vhe())
+		return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+	return vzalloc(sizeof(struct kvm));
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+	if (!has_vhe())
+		kfree(kvm);
+	else
+		vfree(kvm);
+}
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+		return -EBUSY;
+
+	if (id >= kvm->arch.max_vcpus)
+		return -EINVAL;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	/* Force users to call KVM_ARM_VCPU_INIT */
+	vcpu->arch.target = -1;
+	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+
+	/* Set up the timer */
+	kvm_timer_vcpu_init(vcpu);
+
+	kvm_pmu_vcpu_init(vcpu);
+
+	kvm_arm_reset_debug_ptr(vcpu);
+
+	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
+
+	err = kvm_vgic_vcpu_init(vcpu);
+	if (err)
+		return err;
+
+	return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		static_branch_dec(&userspace_irqchip_in_use);
+
+	kvm_mmu_free_memory_caches(vcpu);
+	kvm_timer_vcpu_terminate(vcpu);
+	kvm_pmu_vcpu_destroy(vcpu);
+
+	kvm_arm_vcpu_destroy(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return kvm_timer_is_pending(vcpu);
+}
+
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * If we're about to block (most likely because we've just hit a
+	 * WFI), we need to sync back the state of the GIC CPU interface
+	 * so that we have the latest PMR and group enables. This ensures
+	 * that kvm_arch_vcpu_runnable has up-to-date data to decide
+	 * whether we have pending interrupts.
+	 *
+	 * For the same reason, we want to tell GICv4 that we need
+	 * doorbells to be signalled, should an interrupt become pending.
+	 */
+	preempt_disable();
+	kvm_vgic_vmcr_sync(vcpu);
+	vgic_v4_put(vcpu, true);
+	preempt_enable();
+}
+
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	vgic_v4_load(vcpu);
+	preempt_enable();
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	int *last_ran;
+	kvm_host_data_t *cpu_data;
+
+	last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
+	cpu_data = this_cpu_ptr(&kvm_host_data);
+
+	/*
+	 * We might get preempted before the vCPU actually runs, but
+	 * over-invalidation doesn't affect correctness.
+	 */
+	if (*last_ran != vcpu->vcpu_id) {
+		kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
+		*last_ran = vcpu->vcpu_id;
+	}
+
+	vcpu->cpu = cpu;
+	vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
+
+	kvm_vgic_load(vcpu);
+	kvm_timer_vcpu_load(vcpu);
+	kvm_vcpu_load_sysregs(vcpu);
+	kvm_arch_vcpu_load_fp(vcpu);
+	kvm_vcpu_pmu_restore_guest(vcpu);
+	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
+		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
+
+	if (single_task_running())
+		vcpu_clear_wfx_traps(vcpu);
+	else
+		vcpu_set_wfx_traps(vcpu);
+
+	vcpu_ptrauth_setup_lazy(vcpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_put_fp(vcpu);
+	kvm_vcpu_put_sysregs(vcpu);
+	kvm_timer_vcpu_put(vcpu);
+	kvm_vgic_put(vcpu);
+	kvm_vcpu_pmu_restore_host(vcpu);
+
+	vcpu->cpu = -1;
+}
+
+static void vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.power_off = true;
+	kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	if (vcpu->arch.power_off)
+		mp_state->mp_state = KVM_MP_STATE_STOPPED;
+	else
+		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	int ret = 0;
+
+	switch (mp_state->mp_state) {
+	case KVM_MP_STATE_RUNNABLE:
+		vcpu->arch.power_off = false;
+		break;
+	case KVM_MP_STATE_STOPPED:
+		vcpu_power_off(vcpu);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/**
+ * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
+ * @v:		The VCPU pointer
+ *
+ * If the guest CPU is not waiting for interrupts or an interrupt line is
+ * asserted, the CPU is by definition runnable.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
+		&& !v->arch.power_off && !v->arch.pause);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return vcpu_mode_priv(vcpu);
+}
+
+/* Just ensure a guest exit from a particular CPU */
+static void exit_vm_noop(void *info)
+{
+}
+
+void force_vm_exit(const cpumask_t *mask)
+{
+	preempt_disable();
+	smp_call_function_many(mask, exit_vm_noop, NULL, true);
+	preempt_enable();
+}
+
+/**
+ * need_new_vmid_gen - check that the VMID is still valid
+ * @vmid: The VMID to check
+ *
+ * return true if there is a new generation of VMIDs being used
+ *
+ * The hardware supports a limited set of values with the value zero reserved
+ * for the host, so we check if an assigned value belongs to a previous
+ * generation, which which requires us to assign a new value. If we're the
+ * first to use a VMID for the new generation, we must flush necessary caches
+ * and TLBs on all CPUs.
+ */
+static bool need_new_vmid_gen(struct kvm_vmid *vmid)
+{
+	u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+	smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
+	return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
+}
+
+/**
+ * update_vmid - Update the vmid with a valid VMID for the current generation
+ * @kvm: The guest that struct vmid belongs to
+ * @vmid: The stage-2 VMID information struct
+ */
+static void update_vmid(struct kvm_vmid *vmid)
+{
+	if (!need_new_vmid_gen(vmid))
+		return;
+
+	spin_lock(&kvm_vmid_lock);
+
+	/*
+	 * We need to re-check the vmid_gen here to ensure that if another vcpu
+	 * already allocated a valid vmid for this vm, then this vcpu should
+	 * use the same vmid.
+	 */
+	if (!need_new_vmid_gen(vmid)) {
+		spin_unlock(&kvm_vmid_lock);
+		return;
+	}
+
+	/* First user of a new VMID generation? */
+	if (unlikely(kvm_next_vmid == 0)) {
+		atomic64_inc(&kvm_vmid_gen);
+		kvm_next_vmid = 1;
+
+		/*
+		 * On SMP we know no other CPUs can use this CPU's or each
+		 * other's VMID after force_vm_exit returns since the
+		 * kvm_vmid_lock blocks them from reentry to the guest.
+		 */
+		force_vm_exit(cpu_all_mask);
+		/*
+		 * Now broadcast TLB + ICACHE invalidation over the inner
+		 * shareable domain to make sure all data structures are
+		 * clean.
+		 */
+		kvm_call_hyp(__kvm_flush_vm_context);
+	}
+
+	vmid->vmid = kvm_next_vmid;
+	kvm_next_vmid++;
+	kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
+
+	smp_wmb();
+	WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
+
+	spin_unlock(&kvm_vmid_lock);
+}
+
+static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret = 0;
+
+	if (likely(vcpu->arch.has_run_once))
+		return 0;
+
+	if (!kvm_arm_vcpu_is_finalized(vcpu))
+		return -EPERM;
+
+	vcpu->arch.has_run_once = true;
+
+	if (likely(irqchip_in_kernel(kvm))) {
+		/*
+		 * Map the VGIC hardware resources before running a vcpu the
+		 * first time on this VM.
+		 */
+		if (unlikely(!vgic_ready(kvm))) {
+			ret = kvm_vgic_map_resources(kvm);
+			if (ret)
+				return ret;
+		}
+	} else {
+		/*
+		 * Tell the rest of the code that there are userspace irqchip
+		 * VMs in the wild.
+		 */
+		static_branch_inc(&userspace_irqchip_in_use);
+	}
+
+	ret = kvm_timer_enable(vcpu);
+	if (ret)
+		return ret;
+
+	ret = kvm_arm_pmu_v3_enable(vcpu);
+
+	return ret;
+}
+
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+	return vgic_initialized(kvm);
+}
+
+void kvm_arm_halt_guest(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		vcpu->arch.pause = true;
+	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		vcpu->arch.pause = false;
+		swake_up_one(kvm_arch_vcpu_wq(vcpu));
+	}
+}
+
+static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+{
+	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+
+	swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
+				       (!vcpu->arch.pause)));
+
+	if (vcpu->arch.power_off || vcpu->arch.pause) {
+		/* Awaken to handle a signal, request we sleep again later. */
+		kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	}
+
+	/*
+	 * Make sure we will observe a potential reset request if we've
+	 * observed a change to the power state. Pairs with the smp_wmb() in
+	 * kvm_psci_vcpu_on().
+	 */
+	smp_rmb();
+}
+
+static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.target >= 0;
+}
+
+static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+	if (kvm_request_pending(vcpu)) {
+		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
+			vcpu_req_sleep(vcpu);
+
+		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+			kvm_reset_vcpu(vcpu);
+
+		/*
+		 * Clear IRQ_PENDING requests that were made to guarantee
+		 * that a VCPU sees new virtual interrupts.
+		 */
+		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+
+		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
+			kvm_update_stolen_time(vcpu);
+
+		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+			/* The distributor enable bits were changed */
+			preempt_disable();
+			vgic_v4_put(vcpu, false);
+			vgic_v4_load(vcpu);
+			preempt_enable();
+		}
+	}
+}
+
+/**
+ * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
+ * @vcpu:	The VCPU pointer
+ * @run:	The kvm_run structure pointer used for userspace state exchange
+ *
+ * This function is called through the VCPU_RUN ioctl called from user space. It
+ * will execute VM code in a loop until the time slice for the process is used
+ * or some emulation is needed from user space in which case the function will
+ * return with return value 0 and with the kvm_run structure filled in with the
+ * required data for the requested emulation.
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	int ret;
+
+	if (unlikely(!kvm_vcpu_initialized(vcpu)))
+		return -ENOEXEC;
+
+	ret = kvm_vcpu_first_run_init(vcpu);
+	if (ret)
+		return ret;
+
+	if (run->exit_reason == KVM_EXIT_MMIO) {
+		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
+		if (ret)
+			return ret;
+	}
+
+	if (run->immediate_exit)
+		return -EINTR;
+
+	vcpu_load(vcpu);
+
+	kvm_sigset_activate(vcpu);
+
+	ret = 1;
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	while (ret > 0) {
+		/*
+		 * Check conditions before entering the guest
+		 */
+		cond_resched();
+
+		update_vmid(&vcpu->kvm->arch.vmid);
+
+		check_vcpu_requests(vcpu);
+
+		/*
+		 * Preparing the interrupts to be injected also
+		 * involves poking the GIC, which must be done in a
+		 * non-preemptible context.
+		 */
+		preempt_disable();
+
+		kvm_pmu_flush_hwstate(vcpu);
+
+		local_irq_disable();
+
+		kvm_vgic_flush_hwstate(vcpu);
+
+		/*
+		 * Exit if we have a signal pending so that we can deliver the
+		 * signal to user space.
+		 */
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			run->exit_reason = KVM_EXIT_INTR;
+		}
+
+		/*
+		 * If we're using a userspace irqchip, then check if we need
+		 * to tell a userspace irqchip about timer or PMU level
+		 * changes and if so, exit to userspace (the actual level
+		 * state gets updated in kvm_timer_update_run and
+		 * kvm_pmu_update_run below).
+		 */
+		if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+			if (kvm_timer_should_notify_user(vcpu) ||
+			    kvm_pmu_should_notify_user(vcpu)) {
+				ret = -EINTR;
+				run->exit_reason = KVM_EXIT_INTR;
+			}
+		}
+
+		/*
+		 * Ensure we set mode to IN_GUEST_MODE after we disable
+		 * interrupts and before the final VCPU requests check.
+		 * See the comment in kvm_vcpu_exiting_guest_mode() and
+		 * Documentation/virt/kvm/vcpu-requests.rst
+		 */
+		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
+		if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
+		    kvm_request_pending(vcpu)) {
+			vcpu->mode = OUTSIDE_GUEST_MODE;
+			isb(); /* Ensure work in x_flush_hwstate is committed */
+			kvm_pmu_sync_hwstate(vcpu);
+			if (static_branch_unlikely(&userspace_irqchip_in_use))
+				kvm_timer_sync_hwstate(vcpu);
+			kvm_vgic_sync_hwstate(vcpu);
+			local_irq_enable();
+			preempt_enable();
+			continue;
+		}
+
+		kvm_arm_setup_debug(vcpu);
+
+		/**************************************************************
+		 * Enter the guest
+		 */
+		trace_kvm_entry(*vcpu_pc(vcpu));
+		guest_enter_irqoff();
+
+		if (has_vhe()) {
+			ret = kvm_vcpu_run_vhe(vcpu);
+		} else {
+			ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
+		}
+
+		vcpu->mode = OUTSIDE_GUEST_MODE;
+		vcpu->stat.exits++;
+		/*
+		 * Back from guest
+		 *************************************************************/
+
+		kvm_arm_clear_debug(vcpu);
+
+		/*
+		 * We must sync the PMU state before the vgic state so
+		 * that the vgic can properly sample the updated state of the
+		 * interrupt line.
+		 */
+		kvm_pmu_sync_hwstate(vcpu);
+
+		/*
+		 * Sync the vgic state before syncing the timer state because
+		 * the timer code needs to know if the virtual timer
+		 * interrupts are active.
+		 */
+		kvm_vgic_sync_hwstate(vcpu);
+
+		/*
+		 * Sync the timer hardware state before enabling interrupts as
+		 * we don't want vtimer interrupts to race with syncing the
+		 * timer virtual interrupt state.
+		 */
+		if (static_branch_unlikely(&userspace_irqchip_in_use))
+			kvm_timer_sync_hwstate(vcpu);
+
+		kvm_arch_vcpu_ctxsync_fp(vcpu);
+
+		/*
+		 * We may have taken a host interrupt in HYP mode (ie
+		 * while executing the guest). This interrupt is still
+		 * pending, as we haven't serviced it yet!
+		 *
+		 * We're now back in SVC mode, with interrupts
+		 * disabled.  Enabling the interrupts now will have
+		 * the effect of taking the interrupt again, in SVC
+		 * mode this time.
+		 */
+		local_irq_enable();
+
+		/*
+		 * We do local_irq_enable() before calling guest_exit() so
+		 * that if a timer interrupt hits while running the guest we
+		 * account that tick as being spent in the guest.  We enable
+		 * preemption after calling guest_exit() so that if we get
+		 * preempted we make sure ticks after that is not counted as
+		 * guest time.
+		 */
+		guest_exit();
+		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+
+		/* Exit types that need handling before we can be preempted */
+		handle_exit_early(vcpu, run, ret);
+
+		preempt_enable();
+
+		ret = handle_exit(vcpu, run, ret);
+	}
+
+	/* Tell userspace about in-kernel device output levels */
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
+		kvm_timer_update_run(vcpu);
+		kvm_pmu_update_run(vcpu);
+	}
+
+	kvm_sigset_deactivate(vcpu);
+
+	vcpu_put(vcpu);
+	return ret;
+}
+
+static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
+{
+	int bit_index;
+	bool set;
+	unsigned long *hcr;
+
+	if (number == KVM_ARM_IRQ_CPU_IRQ)
+		bit_index = __ffs(HCR_VI);
+	else /* KVM_ARM_IRQ_CPU_FIQ */
+		bit_index = __ffs(HCR_VF);
+
+	hcr = vcpu_hcr(vcpu);
+	if (level)
+		set = test_and_set_bit(bit_index, hcr);
+	else
+		set = test_and_clear_bit(bit_index, hcr);
+
+	/*
+	 * If we didn't change anything, no need to wake up or kick other CPUs
+	 */
+	if (set == level)
+		return 0;
+
+	/*
+	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
+	 * trigger a world-switch round on the running physical CPU to set the
+	 * virtual IRQ/FIQ fields in the HCR appropriately.
+	 */
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+			  bool line_status)
+{
+	u32 irq = irq_level->irq;
+	unsigned int irq_type, vcpu_idx, irq_num;
+	int nrcpus = atomic_read(&kvm->online_vcpus);
+	struct kvm_vcpu *vcpu = NULL;
+	bool level = irq_level->level;
+
+	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
+	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
+	vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
+	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
+
+	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
+
+	switch (irq_type) {
+	case KVM_ARM_IRQ_TYPE_CPU:
+		if (irqchip_in_kernel(kvm))
+			return -ENXIO;
+
+		if (vcpu_idx >= nrcpus)
+			return -EINVAL;
+
+		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+		if (!vcpu)
+			return -EINVAL;
+
+		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
+			return -EINVAL;
+
+		return vcpu_interrupt_line(vcpu, irq_num, level);
+	case KVM_ARM_IRQ_TYPE_PPI:
+		if (!irqchip_in_kernel(kvm))
+			return -ENXIO;
+
+		if (vcpu_idx >= nrcpus)
+			return -EINVAL;
+
+		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+		if (!vcpu)
+			return -EINVAL;
+
+		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
+			return -EINVAL;
+
+		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
+	case KVM_ARM_IRQ_TYPE_SPI:
+		if (!irqchip_in_kernel(kvm))
+			return -ENXIO;
+
+		if (irq_num < VGIC_NR_PRIVATE_IRQS)
+			return -EINVAL;
+
+		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
+	}
+
+	return -EINVAL;
+}
+
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+			       const struct kvm_vcpu_init *init)
+{
+	unsigned int i, ret;
+	int phys_target = kvm_target_cpu();
+
+	if (init->target != phys_target)
+		return -EINVAL;
+
+	/*
+	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+	 * use the same target.
+	 */
+	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+		return -EINVAL;
+
+	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
+	for (i = 0; i < sizeof(init->features) * 8; i++) {
+		bool set = (init->features[i / 32] & (1 << (i % 32)));
+
+		if (set && i >= KVM_VCPU_MAX_FEATURES)
+			return -ENOENT;
+
+		/*
+		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+		 * use the same feature set.
+		 */
+		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
+		    test_bit(i, vcpu->arch.features) != set)
+			return -EINVAL;
+
+		if (set)
+			set_bit(i, vcpu->arch.features);
+	}
+
+	vcpu->arch.target = phys_target;
+
+	/* Now we know what it is, we can reset it. */
+	ret = kvm_reset_vcpu(vcpu);
+	if (ret) {
+		vcpu->arch.target = -1;
+		bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+	}
+
+	return ret;
+}
+
+static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
+					 struct kvm_vcpu_init *init)
+{
+	int ret;
+
+	ret = kvm_vcpu_set_target(vcpu, init);
+	if (ret)
+		return ret;
+
+	/*
+	 * Ensure a rebooted VM will fault in RAM pages and detect if the
+	 * guest MMU is turned off and flush the caches as needed.
+	 */
+	if (vcpu->arch.has_run_once)
+		stage2_unmap_vm(vcpu->kvm);
+
+	vcpu_reset_hcr(vcpu);
+
+	/*
+	 * Handle the "start in power-off" case.
+	 */
+	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+		vcpu_power_off(vcpu);
+	else
+		vcpu->arch.power_off = false;
+
+	return 0;
+}
+
+static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
+				   struct kvm_vcpu_events *events)
+{
+	memset(events, 0, sizeof(*events));
+
+	return __kvm_arm_vcpu_get_events(vcpu, events);
+}
+
+static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
+				   struct kvm_vcpu_events *events)
+{
+	int i;
+
+	/* check whether the reserved field is zero */
+	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
+		if (events->reserved[i])
+			return -EINVAL;
+
+	/* check whether the pad field is zero */
+	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
+		if (events->exception.pad[i])
+			return -EINVAL;
+
+	return __kvm_arm_vcpu_set_events(vcpu, events);
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	struct kvm_device_attr attr;
+	long r;
+
+	switch (ioctl) {
+	case KVM_ARM_VCPU_INIT: {
+		struct kvm_vcpu_init init;
+
+		r = -EFAULT;
+		if (copy_from_user(&init, argp, sizeof(init)))
+			break;
+
+		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
+		break;
+	}
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG: {
+		struct kvm_one_reg reg;
+
+		r = -ENOEXEC;
+		if (unlikely(!kvm_vcpu_initialized(vcpu)))
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			break;
+
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_arm_set_reg(vcpu, &reg);
+		else
+			r = kvm_arm_get_reg(vcpu, &reg);
+		break;
+	}
+	case KVM_GET_REG_LIST: {
+		struct kvm_reg_list __user *user_list = argp;
+		struct kvm_reg_list reg_list;
+		unsigned n;
+
+		r = -ENOEXEC;
+		if (unlikely(!kvm_vcpu_initialized(vcpu)))
+			break;
+
+		r = -EPERM;
+		if (!kvm_arm_vcpu_is_finalized(vcpu))
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
+			break;
+		n = reg_list.n;
+		reg_list.n = kvm_arm_num_regs(vcpu);
+		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
+			break;
+		r = -E2BIG;
+		if (n < reg_list.n)
+			break;
+		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
+		break;
+	}
+	case KVM_SET_DEVICE_ATTR: {
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			break;
+		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
+		break;
+	}
+	case KVM_GET_DEVICE_ATTR: {
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			break;
+		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
+		break;
+	}
+	case KVM_HAS_DEVICE_ATTR: {
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			break;
+		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
+		break;
+	}
+	case KVM_GET_VCPU_EVENTS: {
+		struct kvm_vcpu_events events;
+
+		if (kvm_arm_vcpu_get_events(vcpu, &events))
+			return -EINVAL;
+
+		if (copy_to_user(argp, &events, sizeof(events)))
+			return -EFAULT;
+
+		return 0;
+	}
+	case KVM_SET_VCPU_EVENTS: {
+		struct kvm_vcpu_events events;
+
+		if (copy_from_user(&events, argp, sizeof(events)))
+			return -EFAULT;
+
+		return kvm_arm_vcpu_set_events(vcpu, &events);
+	}
+	case KVM_ARM_VCPU_FINALIZE: {
+		int what;
+
+		if (!kvm_vcpu_initialized(vcpu))
+			return -ENOEXEC;
+
+		if (get_user(what, (const int __user *)argp))
+			return -EFAULT;
+
+		return kvm_arm_vcpu_finalize(vcpu, what);
+	}
+	default:
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+
+}
+
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+					struct kvm_memory_slot *memslot)
+{
+	kvm_flush_remote_tlbs(kvm);
+}
+
+static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
+					struct kvm_arm_device_addr *dev_addr)
+{
+	unsigned long dev_id, type;
+
+	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
+		KVM_ARM_DEVICE_ID_SHIFT;
+	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
+		KVM_ARM_DEVICE_TYPE_SHIFT;
+
+	switch (dev_id) {
+	case KVM_ARM_DEVICE_VGIC_V2:
+		if (!vgic_present)
+			return -ENXIO;
+		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
+	default:
+		return -ENODEV;
+	}
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm = filp->private_data;
+	void __user *argp = (void __user *)arg;
+
+	switch (ioctl) {
+	case KVM_CREATE_IRQCHIP: {
+		int ret;
+		if (!vgic_present)
+			return -ENXIO;
+		mutex_lock(&kvm->lock);
+		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		mutex_unlock(&kvm->lock);
+		return ret;
+	}
+	case KVM_ARM_SET_DEVICE_ADDR: {
+		struct kvm_arm_device_addr dev_addr;
+
+		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
+			return -EFAULT;
+		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
+	}
+	case KVM_ARM_PREFERRED_TARGET: {
+		int err;
+		struct kvm_vcpu_init init;
+
+		err = kvm_vcpu_preferred_target(&init);
+		if (err)
+			return err;
+
+		if (copy_to_user(argp, &init, sizeof(init)))
+			return -EFAULT;
+
+		return 0;
+	}
+	default:
+		return -EINVAL;
+	}
+}
+
+static void cpu_init_hyp_mode(void)
+{
+	phys_addr_t pgd_ptr;
+	unsigned long hyp_stack_ptr;
+	unsigned long stack_page;
+	unsigned long vector_ptr;
+
+	/* Switch from the HYP stub to our own HYP init vector */
+	__hyp_set_vectors(kvm_get_idmap_vector());
+
+	pgd_ptr = kvm_mmu_get_httbr();
+	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
+	hyp_stack_ptr = stack_page + PAGE_SIZE;
+	vector_ptr = (unsigned long)kvm_get_hyp_vector();
+
+	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
+	__cpu_init_stage2();
+}
+
+static void cpu_hyp_reset(void)
+{
+	if (!is_kernel_in_hyp_mode())
+		__hyp_reset_vectors();
+}
+
+static void cpu_hyp_reinit(void)
+{
+	kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+
+	cpu_hyp_reset();
+
+	if (is_kernel_in_hyp_mode())
+		kvm_timer_init_vhe();
+	else
+		cpu_init_hyp_mode();
+
+	kvm_arm_init_debug();
+
+	if (vgic_present)
+		kvm_vgic_init_cpu_hardware();
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+		cpu_hyp_reinit();
+		__this_cpu_write(kvm_arm_hardware_enabled, 1);
+	}
+}
+
+int kvm_arch_hardware_enable(void)
+{
+	_kvm_arch_hardware_enable(NULL);
+	return 0;
+}
+
+static void _kvm_arch_hardware_disable(void *discard)
+{
+	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+		cpu_hyp_reset();
+		__this_cpu_write(kvm_arm_hardware_enabled, 0);
+	}
+}
+
+void kvm_arch_hardware_disable(void)
+{
+	_kvm_arch_hardware_disable(NULL);
+}
+
+#ifdef CONFIG_CPU_PM
+static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
+				    unsigned long cmd,
+				    void *v)
+{
+	/*
+	 * kvm_arm_hardware_enabled is left with its old value over
+	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+	 * re-enable hyp.
+	 */
+	switch (cmd) {
+	case CPU_PM_ENTER:
+		if (__this_cpu_read(kvm_arm_hardware_enabled))
+			/*
+			 * don't update kvm_arm_hardware_enabled here
+			 * so that the hardware will be re-enabled
+			 * when we resume. See below.
+			 */
+			cpu_hyp_reset();
+
+		return NOTIFY_OK;
+	case CPU_PM_ENTER_FAILED:
+	case CPU_PM_EXIT:
+		if (__this_cpu_read(kvm_arm_hardware_enabled))
+			/* The hardware was enabled before suspend. */
+			cpu_hyp_reinit();
+
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block hyp_init_cpu_pm_nb = {
+	.notifier_call = hyp_init_cpu_pm_notifier,
+};
+
+static void __init hyp_cpu_pm_init(void)
+{
+	cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
+}
+static void __init hyp_cpu_pm_exit(void)
+{
+	cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
+}
+#else
+static inline void hyp_cpu_pm_init(void)
+{
+}
+static inline void hyp_cpu_pm_exit(void)
+{
+}
+#endif
+
+static int init_common_resources(void)
+{
+	kvm_set_ipa_limit();
+
+	return 0;
+}
+
+static int init_subsystems(void)
+{
+	int err = 0;
+
+	/*
+	 * Enable hardware so that subsystem initialisation can access EL2.
+	 */
+	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+
+	/*
+	 * Register CPU lower-power notifier
+	 */
+	hyp_cpu_pm_init();
+
+	/*
+	 * Init HYP view of VGIC
+	 */
+	err = kvm_vgic_hyp_init();
+	switch (err) {
+	case 0:
+		vgic_present = true;
+		break;
+	case -ENODEV:
+	case -ENXIO:
+		vgic_present = false;
+		err = 0;
+		break;
+	default:
+		goto out;
+	}
+
+	/*
+	 * Init HYP architected timer support
+	 */
+	err = kvm_timer_hyp_init(vgic_present);
+	if (err)
+		goto out;
+
+	kvm_perf_init();
+	kvm_coproc_table_init();
+
+out:
+	on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+	return err;
+}
+
+static void teardown_hyp_mode(void)
+{
+	int cpu;
+
+	free_hyp_pgds();
+	for_each_possible_cpu(cpu)
+		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+/**
+ * Inits Hyp-mode on all online CPUs
+ */
+static int init_hyp_mode(void)
+{
+	int cpu;
+	int err = 0;
+
+	/*
+	 * Allocate Hyp PGD and setup Hyp identity mapping
+	 */
+	err = kvm_mmu_init();
+	if (err)
+		goto out_err;
+
+	/*
+	 * Allocate stack pages for Hypervisor-mode
+	 */
+	for_each_possible_cpu(cpu) {
+		unsigned long stack_page;
+
+		stack_page = __get_free_page(GFP_KERNEL);
+		if (!stack_page) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+
+		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
+	}
+
+	/*
+	 * Map the Hyp-code called directly from the host
+	 */
+	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
+				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
+	if (err) {
+		kvm_err("Cannot map world-switch code\n");
+		goto out_err;
+	}
+
+	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
+				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
+	if (err) {
+		kvm_err("Cannot map rodata section\n");
+		goto out_err;
+	}
+
+	err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
+	if (err) {
+		kvm_err("Cannot map bss section\n");
+		goto out_err;
+	}
+
+	err = kvm_map_vectors();
+	if (err) {
+		kvm_err("Cannot map vectors\n");
+		goto out_err;
+	}
+
+	/*
+	 * Map the Hyp stack pages
+	 */
+	for_each_possible_cpu(cpu) {
+		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
+		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+					  PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map hyp stack\n");
+			goto out_err;
+		}
+	}
+
+	for_each_possible_cpu(cpu) {
+		kvm_host_data_t *cpu_data;
+
+		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
+		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map host CPU state: %d\n", err);
+			goto out_err;
+		}
+	}
+
+	err = hyp_map_aux_data();
+	if (err)
+		kvm_err("Cannot map host auxiliary data: %d\n", err);
+
+	return 0;
+
+out_err:
+	teardown_hyp_mode();
+	kvm_err("error initializing Hyp mode: %d\n", err);
+	return err;
+}
+
+static void check_kvm_target_cpu(void *ret)
+{
+	*(int *)ret = kvm_target_cpu();
+}
+
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	mpidr &= MPIDR_HWID_BITMASK;
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
+			return vcpu;
+	}
+	return NULL;
+}
+
+bool kvm_arch_has_irq_bypass(void)
+{
+	return true;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
+					  &irqfd->irq_entry);
+}
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
+				     &irqfd->irq_entry);
+}
+
+void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	kvm_arm_halt_guest(irqfd->kvm);
+}
+
+void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	kvm_arm_resume_guest(irqfd->kvm);
+}
+
+/**
+ * Initialize Hyp-mode and memory mappings on all CPUs.
+ */
+int kvm_arch_init(void *opaque)
+{
+	int err;
+	int ret, cpu;
+	bool in_hyp_mode;
+
+	if (!is_hyp_mode_available()) {
+		kvm_info("HYP mode not available\n");
+		return -ENODEV;
+	}
+
+	in_hyp_mode = is_kernel_in_hyp_mode();
+
+	if (!in_hyp_mode && kvm_arch_requires_vhe()) {
+		kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
+		return -ENODEV;
+	}
+
+	for_each_online_cpu(cpu) {
+		smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
+		if (ret < 0) {
+			kvm_err("Error, CPU %d not supported!\n", cpu);
+			return -ENODEV;
+		}
+	}
+
+	err = init_common_resources();
+	if (err)
+		return err;
+
+	err = kvm_arm_init_sve();
+	if (err)
+		return err;
+
+	if (!in_hyp_mode) {
+		err = init_hyp_mode();
+		if (err)
+			goto out_err;
+	}
+
+	err = init_subsystems();
+	if (err)
+		goto out_hyp;
+
+	if (in_hyp_mode)
+		kvm_info("VHE mode initialized successfully\n");
+	else
+		kvm_info("Hyp mode initialized successfully\n");
+
+	return 0;
+
+out_hyp:
+	hyp_cpu_pm_exit();
+	if (!in_hyp_mode)
+		teardown_hyp_mode();
+out_err:
+	return err;
+}
+
+/* NOP: Compiling as a module not supported */
+void kvm_arch_exit(void)
+{
+	kvm_perf_teardown();
+}
+
+static int arm_init(void)
+{
+	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+	return rc;
+}
+
+module_init(arm_init);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index aacfc55de44c..eb194696ef62 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -23,7 +23,7 @@
 #include <kvm/arm_hypercalls.h>
 
 #define CREATE_TRACE_POINTS
-#include "trace.h"
+#include "trace_handle_exit.h"
 
 typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
 
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index ea710f674cb6..dc18274a6826 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -6,12 +6,9 @@
 ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \
 		$(DISABLE_STACKLEAK_PLUGIN)
 
-KVM=../../../../virt/kvm
-
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o
-
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += aarch32.o
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
new file mode 100644
index 000000000000..25c0e47d57cb
--- /dev/null
+++ b/arch/arm64/kvm/hyp/aarch32.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+/*
+ * stolen from arch/arm/kernel/opcodes.c
+ *
+ * condition code lookup table
+ * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
+ *
+ * bit position in short is condition code: NZCV
+ */
+static const unsigned short cc_map[16] = {
+	0xF0F0,			/* EQ == Z set            */
+	0x0F0F,			/* NE                     */
+	0xCCCC,			/* CS == C set            */
+	0x3333,			/* CC                     */
+	0xFF00,			/* MI == N set            */
+	0x00FF,			/* PL                     */
+	0xAAAA,			/* VS == V set            */
+	0x5555,			/* VC                     */
+	0x0C0C,			/* HI == C set && Z clear */
+	0xF3F3,			/* LS == C clear || Z set */
+	0xAA55,			/* GE == (N==V)           */
+	0x55AA,			/* LT == (N!=V)           */
+	0x0A05,			/* GT == (!Z && (N==V))   */
+	0xF5FA,			/* LE == (Z || (N!=V))    */
+	0xFFFF,			/* AL always              */
+	0			/* NV                     */
+};
+
+/*
+ * Check if a trapped instruction should have been executed or not.
+ */
+bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+{
+	unsigned long cpsr;
+	u32 cpsr_cond;
+	int cond;
+
+	/* Top two bits non-zero?  Unconditional. */
+	if (kvm_vcpu_get_hsr(vcpu) >> 30)
+		return true;
+
+	/* Is condition field valid? */
+	cond = kvm_vcpu_get_condition(vcpu);
+	if (cond == 0xE)
+		return true;
+
+	cpsr = *vcpu_cpsr(vcpu);
+
+	if (cond < 0) {
+		/* This can happen in Thumb mode: examine IT state. */
+		unsigned long it;
+
+		it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
+
+		/* it == 0 => unconditional. */
+		if (it == 0)
+			return true;
+
+		/* The cond for this insn works out as the top 4 bits. */
+		cond = (it >> 4);
+	}
+
+	cpsr_cond = cpsr >> 28;
+
+	if (!((cc_map[cond] >> cpsr_cond) & 1))
+		return false;
+
+	return true;
+}
+
+/**
+ * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
+ * @vcpu:	The VCPU pointer
+ *
+ * When exceptions occur while instructions are executed in Thumb IF-THEN
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
+ * to do this little bit of work manually. The fields map like this:
+ *
+ * IT[7:0] -> CPSR[26:25],CPSR[15:10]
+ */
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+{
+	unsigned long itbits, cond;
+	unsigned long cpsr = *vcpu_cpsr(vcpu);
+	bool is_arm = !(cpsr & PSR_AA32_T_BIT);
+
+	if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
+		return;
+
+	cond = (cpsr & 0xe000) >> 13;
+	itbits = (cpsr & 0x1c00) >> (10 - 2);
+	itbits |= (cpsr & (0x3 << 25)) >> 25;
+
+	/* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
+	if ((itbits & 0x7) == 0)
+		itbits = cond = 0;
+	else
+		itbits = (itbits << 1) & 0x1f;
+
+	cpsr &= ~PSR_AA32_IT_MASK;
+	cpsr |= cond << 13;
+	cpsr |= (itbits & 0x1c) << (10 - 2);
+	cpsr |= (itbits & 0x3) << 25;
+	*vcpu_cpsr(vcpu) = cpsr;
+}
+
+/**
+ * kvm_skip_instr - skip a trapped instruction and proceed to the next
+ * @vcpu: The vcpu pointer
+ */
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+	u32 pc = *vcpu_pc(vcpu);
+	bool is_thumb;
+
+	is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
+	if (is_thumb && !is_wide_instr)
+		pc += 2;
+	else
+		pc += 4;
+
+	*vcpu_pc(vcpu) = pc;
+
+	kvm_adjust_itstate(vcpu);
+}
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c
new file mode 100644
index 000000000000..ff76e6845fe4
--- /dev/null
+++ b/arch/arm64/kvm/hyp/timer-sr.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <clocksource/arm_arch_timer.h>
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+
+void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
+{
+	u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
+	write_sysreg(cntvoff, cntvoff_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
+{
+	u64 val;
+
+	/* Allow physical timer/counter access for the host */
+	val = read_sysreg(cnthctl_el2);
+	val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+	write_sysreg(val, cnthctl_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
+{
+	u64 val;
+
+	/*
+	 * Disallow physical timer access for the guest
+	 * Physical counter access is allowed
+	 */
+	val = read_sysreg(cnthctl_el2);
+	val &= ~CNTHCTL_EL1PCEN;
+	val |= CNTHCTL_EL1PCTEN;
+	write_sysreg(val, cnthctl_el2);
+}
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
new file mode 100644
index 000000000000..49fedf6710f9
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -0,0 +1,1126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#define vtr_to_max_lr_idx(v)		((v) & 0xf)
+#define vtr_to_nr_pre_bits(v)		((((u32)(v) >> 26) & 7) + 1)
+#define vtr_to_nr_apr_regs(v)		(1 << (vtr_to_nr_pre_bits(v) - 5))
+
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+	switch (lr & 0xf) {
+	case 0:
+		return read_gicreg(ICH_LR0_EL2);
+	case 1:
+		return read_gicreg(ICH_LR1_EL2);
+	case 2:
+		return read_gicreg(ICH_LR2_EL2);
+	case 3:
+		return read_gicreg(ICH_LR3_EL2);
+	case 4:
+		return read_gicreg(ICH_LR4_EL2);
+	case 5:
+		return read_gicreg(ICH_LR5_EL2);
+	case 6:
+		return read_gicreg(ICH_LR6_EL2);
+	case 7:
+		return read_gicreg(ICH_LR7_EL2);
+	case 8:
+		return read_gicreg(ICH_LR8_EL2);
+	case 9:
+		return read_gicreg(ICH_LR9_EL2);
+	case 10:
+		return read_gicreg(ICH_LR10_EL2);
+	case 11:
+		return read_gicreg(ICH_LR11_EL2);
+	case 12:
+		return read_gicreg(ICH_LR12_EL2);
+	case 13:
+		return read_gicreg(ICH_LR13_EL2);
+	case 14:
+		return read_gicreg(ICH_LR14_EL2);
+	case 15:
+		return read_gicreg(ICH_LR15_EL2);
+	}
+
+	unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+	switch (lr & 0xf) {
+	case 0:
+		write_gicreg(val, ICH_LR0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_LR1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_LR2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_LR3_EL2);
+		break;
+	case 4:
+		write_gicreg(val, ICH_LR4_EL2);
+		break;
+	case 5:
+		write_gicreg(val, ICH_LR5_EL2);
+		break;
+	case 6:
+		write_gicreg(val, ICH_LR6_EL2);
+		break;
+	case 7:
+		write_gicreg(val, ICH_LR7_EL2);
+		break;
+	case 8:
+		write_gicreg(val, ICH_LR8_EL2);
+		break;
+	case 9:
+		write_gicreg(val, ICH_LR9_EL2);
+		break;
+	case 10:
+		write_gicreg(val, ICH_LR10_EL2);
+		break;
+	case 11:
+		write_gicreg(val, ICH_LR11_EL2);
+		break;
+	case 12:
+		write_gicreg(val, ICH_LR12_EL2);
+		break;
+	case 13:
+		write_gicreg(val, ICH_LR13_EL2);
+		break;
+	case 14:
+		write_gicreg(val, ICH_LR14_EL2);
+		break;
+	case 15:
+		write_gicreg(val, ICH_LR15_EL2);
+		break;
+	}
+}
+
+static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
+{
+	switch (n) {
+	case 0:
+		write_gicreg(val, ICH_AP0R0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_AP0R1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_AP0R2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_AP0R3_EL2);
+		break;
+	}
+}
+
+static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
+{
+	switch (n) {
+	case 0:
+		write_gicreg(val, ICH_AP1R0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_AP1R1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_AP1R2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_AP1R3_EL2);
+		break;
+	}
+}
+
+static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
+{
+	u32 val;
+
+	switch (n) {
+	case 0:
+		val = read_gicreg(ICH_AP0R0_EL2);
+		break;
+	case 1:
+		val = read_gicreg(ICH_AP0R1_EL2);
+		break;
+	case 2:
+		val = read_gicreg(ICH_AP0R2_EL2);
+		break;
+	case 3:
+		val = read_gicreg(ICH_AP0R3_EL2);
+		break;
+	default:
+		unreachable();
+	}
+
+	return val;
+}
+
+static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
+{
+	u32 val;
+
+	switch (n) {
+	case 0:
+		val = read_gicreg(ICH_AP1R0_EL2);
+		break;
+	case 1:
+		val = read_gicreg(ICH_AP1R1_EL2);
+		break;
+	case 2:
+		val = read_gicreg(ICH_AP1R2_EL2);
+		break;
+	case 3:
+		val = read_gicreg(ICH_AP1R3_EL2);
+		break;
+	default:
+		unreachable();
+	}
+
+	return val;
+}
+
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+
+	/*
+	 * Make sure stores to the GIC via the memory mapped interface
+	 * are now visible to the system register interface when reading the
+	 * LRs, and when reading back the VMCR on non-VHE systems.
+	 */
+	if (used_lrs || !has_vhe()) {
+		if (!cpu_if->vgic_sre) {
+			dsb(sy);
+			isb();
+		}
+	}
+
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
+		int i;
+		u32 elrsr;
+
+		elrsr = read_gicreg(ICH_ELRSR_EL2);
+
+		write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
+
+		for (i = 0; i < used_lrs; i++) {
+			if (elrsr & (1 << i))
+				cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+			else
+				cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+
+			__gic_v3_set_lr(0, i);
+		}
+	}
+}
+
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	int i;
+
+	if (used_lrs || cpu_if->its_vpe.its_vm) {
+		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+		for (i = 0; i < used_lrs; i++)
+			__gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+	}
+
+	/*
+	 * Ensure that writes to the LRs, and on non-VHE systems ensure that
+	 * the write to the VMCR in __vgic_v3_activate_traps(), will have
+	 * reached the (re)distributors. This ensure the guest will read the
+	 * correct values from the memory-mapped interface.
+	 */
+	if (used_lrs || !has_vhe()) {
+		if (!cpu_if->vgic_sre) {
+			isb();
+			dsb(sy);
+		}
+	}
+}
+
+void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	/*
+	 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+	 * Group0 interrupt (as generated in GICv2 mode) to be
+	 * delivered as a FIQ to the guest, with potentially fatal
+	 * consequences. So we must make sure that ICC_SRE_EL1 has
+	 * been actually programmed with the value we want before
+	 * starting to mess with the rest of the GIC, and VMCR_EL2 in
+	 * particular.  This logic must be called before
+	 * __vgic_v3_restore_state().
+	 */
+	if (!cpu_if->vgic_sre) {
+		write_gicreg(0, ICC_SRE_EL1);
+		isb();
+		write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+
+		if (has_vhe()) {
+			/*
+			 * Ensure that the write to the VMCR will have reached
+			 * the (re)distributors. This ensure the guest will
+			 * read the correct values from the memory-mapped
+			 * interface.
+			 */
+			isb();
+			dsb(sy);
+		}
+	}
+
+	/*
+	 * Prevent the guest from touching the GIC system registers if
+	 * SRE isn't enabled for GICv3 emulation.
+	 */
+	write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+		     ICC_SRE_EL2);
+
+	/*
+	 * If we need to trap system registers, we must write
+	 * ICH_HCR_EL2 anyway, even if no interrupts are being
+	 * injected,
+	 */
+	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+	    cpu_if->its_vpe.its_vm)
+		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u64 val;
+
+	if (!cpu_if->vgic_sre) {
+		cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+	}
+
+	val = read_gicreg(ICC_SRE_EL2);
+	write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+
+	if (!cpu_if->vgic_sre) {
+		/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+		isb();
+		write_gicreg(1, ICC_SRE_EL1);
+	}
+
+	/*
+	 * If we were trapping system registers, we enabled the VGIC even if
+	 * no interrupts were being injected, and we disable it again here.
+	 */
+	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+	    cpu_if->its_vpe.its_vm)
+		write_gicreg(0, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if;
+	u64 val;
+	u32 nr_pre_bits;
+
+	vcpu = kern_hyp_va(vcpu);
+	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	val = read_gicreg(ICH_VTR_EL2);
+	nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+	switch (nr_pre_bits) {
+	case 7:
+		cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
+		cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
+		/* Fall through */
+	case 6:
+		cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
+		/* Fall through */
+	default:
+		cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
+	}
+
+	switch (nr_pre_bits) {
+	case 7:
+		cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
+		cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
+		/* Fall through */
+	case 6:
+		cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
+		/* Fall through */
+	default:
+		cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
+	}
+}
+
+void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if;
+	u64 val;
+	u32 nr_pre_bits;
+
+	vcpu = kern_hyp_va(vcpu);
+	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	val = read_gicreg(ICH_VTR_EL2);
+	nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+	switch (nr_pre_bits) {
+	case 7:
+		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
+		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
+		/* Fall through */
+	case 6:
+		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
+		/* Fall through */
+	default:
+		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
+	}
+
+	switch (nr_pre_bits) {
+	case 7:
+		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
+		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
+		/* Fall through */
+	case 6:
+		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
+		/* Fall through */
+	default:
+		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
+	}
+}
+
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+	int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+	int i;
+
+	for (i = 0; i <= max_lr_idx; i++)
+		__gic_v3_set_lr(0, i);
+}
+
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
+{
+	return read_gicreg(ICH_VTR_EL2);
+}
+
+u64 __hyp_text __vgic_v3_read_vmcr(void)
+{
+	return read_gicreg(ICH_VMCR_EL2);
+}
+
+void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
+{
+	write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static int __hyp_text __vgic_v3_bpr_min(void)
+{
+	/* See Pseudocode for VPriorityGroup */
+	return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
+}
+
+static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
+{
+	u32 esr = kvm_vcpu_get_hsr(vcpu);
+	u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
+
+	return crm != 8;
+}
+
+#define GICv3_IDLE_PRIORITY	0xff
+
+static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
+						    u32 vmcr,
+						    u64 *lr_val)
+{
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u8 priority = GICv3_IDLE_PRIORITY;
+	int i, lr = -1;
+
+	for (i = 0; i < used_lrs; i++) {
+		u64 val = __gic_v3_get_lr(i);
+		u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+		/* Not pending in the state? */
+		if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
+			continue;
+
+		/* Group-0 interrupt, but Group-0 disabled? */
+		if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
+			continue;
+
+		/* Group-1 interrupt, but Group-1 disabled? */
+		if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
+			continue;
+
+		/* Not the highest priority? */
+		if (lr_prio >= priority)
+			continue;
+
+		/* This is a candidate */
+		priority = lr_prio;
+		*lr_val = val;
+		lr = i;
+	}
+
+	if (lr == -1)
+		*lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+	return lr;
+}
+
+static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
+					       int intid, u64 *lr_val)
+{
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	int i;
+
+	for (i = 0; i < used_lrs; i++) {
+		u64 val = __gic_v3_get_lr(i);
+
+		if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
+		    (val & ICH_LR_ACTIVE_BIT)) {
+			*lr_val = val;
+			return i;
+		}
+	}
+
+	*lr_val = ICC_IAR1_EL1_SPURIOUS;
+	return -1;
+}
+
+static int __hyp_text __vgic_v3_get_highest_active_priority(void)
+{
+	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+	u32 hap = 0;
+	int i;
+
+	for (i = 0; i < nr_apr_regs; i++) {
+		u32 val;
+
+		/*
+		 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
+		 * contain the active priority levels for this VCPU
+		 * for the maximum number of supported priority
+		 * levels, and we return the full priority level only
+		 * if the BPR is programmed to its minimum, otherwise
+		 * we return a combination of the priority level and
+		 * subpriority, as determined by the setting of the
+		 * BPR, but without the full subpriority.
+		 */
+		val  = __vgic_v3_read_ap0rn(i);
+		val |= __vgic_v3_read_ap1rn(i);
+		if (!val) {
+			hap += 32;
+			continue;
+		}
+
+		return (hap + __ffs(val)) << __vgic_v3_bpr_min();
+	}
+
+	return GICv3_IDLE_PRIORITY;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
+{
+	return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
+{
+	unsigned int bpr;
+
+	if (vmcr & ICH_VMCR_CBPR_MASK) {
+		bpr = __vgic_v3_get_bpr0(vmcr);
+		if (bpr < 7)
+			bpr++;
+	} else {
+		bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+	}
+
+	return bpr;
+}
+
+/*
+ * Convert a priority to a preemption level, taking the relevant BPR
+ * into account by zeroing the sub-priority bits.
+ */
+static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
+{
+	unsigned int bpr;
+
+	if (!grp)
+		bpr = __vgic_v3_get_bpr0(vmcr) + 1;
+	else
+		bpr = __vgic_v3_get_bpr1(vmcr);
+
+	return pri & (GENMASK(7, 0) << bpr);
+}
+
+/*
+ * The priority value is independent of any of the BPR values, so we
+ * normalize it using the minumal BPR value. This guarantees that no
+ * matter what the guest does with its BPR, we can always set/get the
+ * same value of a priority.
+ */
+static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
+{
+	u8 pre, ap;
+	u32 val;
+	int apr;
+
+	pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
+	ap = pre >> __vgic_v3_bpr_min();
+	apr = ap / 32;
+
+	if (!grp) {
+		val = __vgic_v3_read_ap0rn(apr);
+		__vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
+	} else {
+		val = __vgic_v3_read_ap1rn(apr);
+		__vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
+	}
+}
+
+static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
+{
+	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+	u32 hap = 0;
+	int i;
+
+	for (i = 0; i < nr_apr_regs; i++) {
+		u32 ap0, ap1;
+		int c0, c1;
+
+		ap0 = __vgic_v3_read_ap0rn(i);
+		ap1 = __vgic_v3_read_ap1rn(i);
+		if (!ap0 && !ap1) {
+			hap += 32;
+			continue;
+		}
+
+		c0 = ap0 ? __ffs(ap0) : 32;
+		c1 = ap1 ? __ffs(ap1) : 32;
+
+		/* Always clear the LSB, which is the highest priority */
+		if (c0 < c1) {
+			ap0 &= ~BIT(c0);
+			__vgic_v3_write_ap0rn(ap0, i);
+			hap += c0;
+		} else {
+			ap1 &= ~BIT(c1);
+			__vgic_v3_write_ap1rn(ap1, i);
+			hap += c1;
+		}
+
+		/* Rescale to 8 bits of priority */
+		return hap << __vgic_v3_bpr_min();
+	}
+
+	return GICv3_IDLE_PRIORITY;
+}
+
+static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 lr_val;
+	u8 lr_prio, pmr;
+	int lr, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+	if (lr < 0)
+		goto spurious;
+
+	if (grp != !!(lr_val & ICH_LR_GROUP))
+		goto spurious;
+
+	pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+	if (pmr <= lr_prio)
+		goto spurious;
+
+	if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
+		goto spurious;
+
+	lr_val &= ~ICH_LR_STATE;
+	/* No active state for LPIs */
+	if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
+		lr_val |= ICH_LR_ACTIVE_BIT;
+	__gic_v3_set_lr(lr_val, lr);
+	__vgic_v3_set_active_priority(lr_prio, vmcr, grp);
+	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+	return;
+
+spurious:
+	vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
+}
+
+static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
+{
+	lr_val &= ~ICH_LR_ACTIVE_BIT;
+	if (lr_val & ICH_LR_HW) {
+		u32 pid;
+
+		pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
+		gic_write_dir(pid);
+	}
+
+	__gic_v3_set_lr(lr_val, lr);
+}
+
+static void __hyp_text __vgic_v3_bump_eoicount(void)
+{
+	u32 hcr;
+
+	hcr = read_gicreg(ICH_HCR_EL2);
+	hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+	write_gicreg(hcr, ICH_HCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 vid = vcpu_get_reg(vcpu, rt);
+	u64 lr_val;
+	int lr;
+
+	/* EOImode == 0, nothing to be done here */
+	if (!(vmcr & ICH_VMCR_EOIM_MASK))
+		return;
+
+	/* No deactivate to be performed on an LPI */
+	if (vid >= VGIC_MIN_LPI)
+		return;
+
+	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+	if (lr == -1) {
+		__vgic_v3_bump_eoicount();
+		return;
+	}
+
+	__vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u32 vid = vcpu_get_reg(vcpu, rt);
+	u64 lr_val;
+	u8 lr_prio, act_prio;
+	int lr, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	/* Drop priority in any case */
+	act_prio = __vgic_v3_clear_highest_active_priority();
+
+	/* If EOIing an LPI, no deactivate to be performed */
+	if (vid >= VGIC_MIN_LPI)
+		return;
+
+	/* EOImode == 1, nothing to be done here */
+	if (vmcr & ICH_VMCR_EOIM_MASK)
+		return;
+
+	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+	if (lr == -1) {
+		__vgic_v3_bump_eoicount();
+		return;
+	}
+
+	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+	/* If priorities or group do not match, the guest has fscked-up. */
+	if (grp != !!(lr_val & ICH_LR_GROUP) ||
+	    __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
+		return;
+
+	/* Let's now perform the deactivation */
+	__vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
+}
+
+static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
+}
+
+static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & 1)
+		vmcr |= ICH_VMCR_ENG0_MASK;
+	else
+		vmcr &= ~ICH_VMCR_ENG0_MASK;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & 1)
+		vmcr |= ICH_VMCR_ENG1_MASK;
+	else
+		vmcr &= ~ICH_VMCR_ENG1_MASK;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
+}
+
+static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
+}
+
+static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+	u8 bpr_min = __vgic_v3_bpr_min() - 1;
+
+	/* Enforce BPR limiting */
+	if (val < bpr_min)
+		val = bpr_min;
+
+	val <<= ICH_VMCR_BPR0_SHIFT;
+	val &= ICH_VMCR_BPR0_MASK;
+	vmcr &= ~ICH_VMCR_BPR0_MASK;
+	vmcr |= val;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+	u64 val = vcpu_get_reg(vcpu, rt);
+	u8 bpr_min = __vgic_v3_bpr_min();
+
+	if (vmcr & ICH_VMCR_CBPR_MASK)
+		return;
+
+	/* Enforce BPR limiting */
+	if (val < bpr_min)
+		val = bpr_min;
+
+	val <<= ICH_VMCR_BPR1_SHIFT;
+	val &= ICH_VMCR_BPR1_MASK;
+	vmcr &= ~ICH_VMCR_BPR1_MASK;
+	vmcr |= val;
+
+	__vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+	u32 val;
+
+	if (!__vgic_v3_get_group(vcpu))
+		val = __vgic_v3_read_ap0rn(n);
+	else
+		val = __vgic_v3_read_ap1rn(n);
+
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	if (!__vgic_v3_get_group(vcpu))
+		__vgic_v3_write_ap0rn(val, n);
+	else
+		__vgic_v3_write_ap1rn(val, n);
+}
+
+static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	__vgic_v3_read_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
+					     u32 vmcr, int rt)
+{
+	__vgic_v3_write_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	u64 lr_val;
+	int lr, lr_grp, grp;
+
+	grp = __vgic_v3_get_group(vcpu);
+
+	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+	if (lr == -1)
+		goto spurious;
+
+	lr_grp = !!(lr_val & ICH_LR_GROUP);
+	if (lr_grp != grp)
+		lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+spurious:
+	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+}
+
+static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
+					  u32 vmcr, int rt)
+{
+	vmcr &= ICH_VMCR_PMR_MASK;
+	vmcr >>= ICH_VMCR_PMR_SHIFT;
+	vcpu_set_reg(vcpu, rt, vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	val <<= ICH_VMCR_PMR_SHIFT;
+	val &= ICH_VMCR_PMR_MASK;
+	vmcr &= ~ICH_VMCR_PMR_MASK;
+	vmcr |= val;
+
+	write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
+					  u32 vmcr, int rt)
+{
+	u32 val = __vgic_v3_get_highest_active_priority();
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
+					   u32 vmcr, int rt)
+{
+	u32 vtr, val;
+
+	vtr = read_gicreg(ICH_VTR_EL2);
+	/* PRIbits */
+	val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
+	/* IDbits */
+	val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+	/* SEIS */
+	val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
+	/* A3V */
+	val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
+	/* EOImode */
+	val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
+	/* CBPR */
+	val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+
+	vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
+					    u32 vmcr, int rt)
+{
+	u32 val = vcpu_get_reg(vcpu, rt);
+
+	if (val & ICC_CTLR_EL1_CBPR_MASK)
+		vmcr |= ICH_VMCR_CBPR_MASK;
+	else
+		vmcr &= ~ICH_VMCR_CBPR_MASK;
+
+	if (val & ICC_CTLR_EL1_EOImode_MASK)
+		vmcr |= ICH_VMCR_EOIM_MASK;
+	else
+		vmcr &= ~ICH_VMCR_EOIM_MASK;
+
+	write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+	int rt;
+	u32 esr;
+	u32 vmcr;
+	void (*fn)(struct kvm_vcpu *, u32, int);
+	bool is_read;
+	u32 sysreg;
+
+	esr = kvm_vcpu_get_hsr(vcpu);
+	if (vcpu_mode_is_32bit(vcpu)) {
+		if (!kvm_condition_valid(vcpu)) {
+			__kvm_skip_instr(vcpu);
+			return 1;
+		}
+
+		sysreg = esr_cp15_to_sysreg(esr);
+	} else {
+		sysreg = esr_sys64_to_sysreg(esr);
+	}
+
+	is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+	switch (sysreg) {
+	case SYS_ICC_IAR0_EL1:
+	case SYS_ICC_IAR1_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_iar;
+		break;
+	case SYS_ICC_EOIR0_EL1:
+	case SYS_ICC_EOIR1_EL1:
+		if (unlikely(is_read))
+			return 0;
+		fn = __vgic_v3_write_eoir;
+		break;
+	case SYS_ICC_IGRPEN1_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_igrpen1;
+		else
+			fn = __vgic_v3_write_igrpen1;
+		break;
+	case SYS_ICC_BPR1_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_bpr1;
+		else
+			fn = __vgic_v3_write_bpr1;
+		break;
+	case SYS_ICC_AP0Rn_EL1(0):
+	case SYS_ICC_AP1Rn_EL1(0):
+		if (is_read)
+			fn = __vgic_v3_read_apxr0;
+		else
+			fn = __vgic_v3_write_apxr0;
+		break;
+	case SYS_ICC_AP0Rn_EL1(1):
+	case SYS_ICC_AP1Rn_EL1(1):
+		if (is_read)
+			fn = __vgic_v3_read_apxr1;
+		else
+			fn = __vgic_v3_write_apxr1;
+		break;
+	case SYS_ICC_AP0Rn_EL1(2):
+	case SYS_ICC_AP1Rn_EL1(2):
+		if (is_read)
+			fn = __vgic_v3_read_apxr2;
+		else
+			fn = __vgic_v3_write_apxr2;
+		break;
+	case SYS_ICC_AP0Rn_EL1(3):
+	case SYS_ICC_AP1Rn_EL1(3):
+		if (is_read)
+			fn = __vgic_v3_read_apxr3;
+		else
+			fn = __vgic_v3_write_apxr3;
+		break;
+	case SYS_ICC_HPPIR0_EL1:
+	case SYS_ICC_HPPIR1_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_hppir;
+		break;
+	case SYS_ICC_IGRPEN0_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_igrpen0;
+		else
+			fn = __vgic_v3_write_igrpen0;
+		break;
+	case SYS_ICC_BPR0_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_bpr0;
+		else
+			fn = __vgic_v3_write_bpr0;
+		break;
+	case SYS_ICC_DIR_EL1:
+		if (unlikely(is_read))
+			return 0;
+		fn = __vgic_v3_write_dir;
+		break;
+	case SYS_ICC_RPR_EL1:
+		if (unlikely(!is_read))
+			return 0;
+		fn = __vgic_v3_read_rpr;
+		break;
+	case SYS_ICC_CTLR_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_ctlr;
+		else
+			fn = __vgic_v3_write_ctlr;
+		break;
+	case SYS_ICC_PMR_EL1:
+		if (is_read)
+			fn = __vgic_v3_read_pmr;
+		else
+			fn = __vgic_v3_write_pmr;
+		break;
+	default:
+		return 0;
+	}
+
+	vmcr = __vgic_v3_read_vmcr();
+	rt = kvm_vcpu_sys_get_rt(vcpu);
+	fn(vcpu, vmcr, rt);
+
+	__kvm_skip_instr(vcpu);
+
+	return 1;
+}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
new file mode 100644
index 000000000000..550dfa3e53cd
--- /dev/null
+++ b/arch/arm64/kvm/hypercalls.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_psci.h>
+
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+{
+	u32 func_id = smccc_get_function(vcpu);
+	long val = SMCCC_RET_NOT_SUPPORTED;
+	u32 feature;
+	gpa_t gpa;
+
+	switch (func_id) {
+	case ARM_SMCCC_VERSION_FUNC_ID:
+		val = ARM_SMCCC_VERSION_1_1;
+		break;
+	case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+		feature = smccc_get_arg1(vcpu);
+		switch (feature) {
+		case ARM_SMCCC_ARCH_WORKAROUND_1:
+			switch (kvm_arm_harden_branch_predictor()) {
+			case KVM_BP_HARDEN_UNKNOWN:
+				break;
+			case KVM_BP_HARDEN_WA_NEEDED:
+				val = SMCCC_RET_SUCCESS;
+				break;
+			case KVM_BP_HARDEN_NOT_REQUIRED:
+				val = SMCCC_RET_NOT_REQUIRED;
+				break;
+			}
+			break;
+		case ARM_SMCCC_ARCH_WORKAROUND_2:
+			switch (kvm_arm_have_ssbd()) {
+			case KVM_SSBD_FORCE_DISABLE:
+			case KVM_SSBD_UNKNOWN:
+				break;
+			case KVM_SSBD_KERNEL:
+				val = SMCCC_RET_SUCCESS;
+				break;
+			case KVM_SSBD_FORCE_ENABLE:
+			case KVM_SSBD_MITIGATED:
+				val = SMCCC_RET_NOT_REQUIRED;
+				break;
+			}
+			break;
+		case ARM_SMCCC_HV_PV_TIME_FEATURES:
+			val = SMCCC_RET_SUCCESS;
+			break;
+		}
+		break;
+	case ARM_SMCCC_HV_PV_TIME_FEATURES:
+		val = kvm_hypercall_pv_features(vcpu);
+		break;
+	case ARM_SMCCC_HV_PV_TIME_ST:
+		gpa = kvm_init_stolen_time(vcpu);
+		if (gpa != GPA_INVALID)
+			val = gpa;
+		break;
+	default:
+		return kvm_psci_call(vcpu);
+	}
+
+	smccc_set_retval(vcpu, val, 0, 0, 0);
+	return 1;
+}
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
new file mode 100644
index 000000000000..aedfcff99ac5
--- /dev/null
+++ b/arch/arm64/kvm/mmio.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
+{
+	void *datap = NULL;
+	union {
+		u8	byte;
+		u16	hword;
+		u32	word;
+		u64	dword;
+	} tmp;
+
+	switch (len) {
+	case 1:
+		tmp.byte	= data;
+		datap		= &tmp.byte;
+		break;
+	case 2:
+		tmp.hword	= data;
+		datap		= &tmp.hword;
+		break;
+	case 4:
+		tmp.word	= data;
+		datap		= &tmp.word;
+		break;
+	case 8:
+		tmp.dword	= data;
+		datap		= &tmp.dword;
+		break;
+	}
+
+	memcpy(buf, datap, len);
+}
+
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
+{
+	unsigned long data = 0;
+	union {
+		u16	hword;
+		u32	word;
+		u64	dword;
+	} tmp;
+
+	switch (len) {
+	case 1:
+		data = *(u8 *)buf;
+		break;
+	case 2:
+		memcpy(&tmp.hword, buf, len);
+		data = tmp.hword;
+		break;
+	case 4:
+		memcpy(&tmp.word, buf, len);
+		data = tmp.word;
+		break;
+	case 8:
+		memcpy(&tmp.dword, buf, len);
+		data = tmp.dword;
+		break;
+	}
+
+	return data;
+}
+
+/**
+ * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ *			     or in-kernel IO emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run:  The VCPU run struct containing the mmio data
+ */
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	unsigned long data;
+	unsigned int len;
+	int mask;
+
+	/* Detect an already handled MMIO return */
+	if (unlikely(!vcpu->mmio_needed))
+		return 0;
+
+	vcpu->mmio_needed = 0;
+
+	if (!kvm_vcpu_dabt_iswrite(vcpu)) {
+		len = kvm_vcpu_dabt_get_as(vcpu);
+		data = kvm_mmio_read_buf(run->mmio.data, len);
+
+		if (kvm_vcpu_dabt_issext(vcpu) &&
+		    len < sizeof(unsigned long)) {
+			mask = 1U << ((len * 8) - 1);
+			data = (data ^ mask) - mask;
+		}
+
+		if (!kvm_vcpu_dabt_issf(vcpu))
+			data = data & 0xffffffff;
+
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
+			       &data);
+		data = vcpu_data_host_to_guest(vcpu, data, len);
+		vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);
+	}
+
+	/*
+	 * The MMIO instruction is emulated and should not be re-executed
+	 * in the guest.
+	 */
+	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
+	return 0;
+}
+
+int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+		 phys_addr_t fault_ipa)
+{
+	unsigned long data;
+	unsigned long rt;
+	int ret;
+	bool is_write;
+	int len;
+	u8 data_buf[8];
+
+	/*
+	 * No valid syndrome? Ask userspace for help if it has
+	 * voluntered to do so, and bail out otherwise.
+	 */
+	if (!kvm_vcpu_dabt_isvalid(vcpu)) {
+		if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
+			run->exit_reason = KVM_EXIT_ARM_NISV;
+			run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
+			run->arm_nisv.fault_ipa = fault_ipa;
+			return 0;
+		}
+
+		kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
+		return -ENOSYS;
+	}
+
+	/* Page table accesses IO mem: tell guest to fix its TTBR */
+	if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+		kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+		return 1;
+	}
+
+	/*
+	 * Prepare MMIO operation. First decode the syndrome data we get
+	 * from the CPU. Then try if some in-kernel emulation feels
+	 * responsible, otherwise let user space do its magic.
+	 */
+	is_write = kvm_vcpu_dabt_iswrite(vcpu);
+	len = kvm_vcpu_dabt_get_as(vcpu);
+	rt = kvm_vcpu_dabt_get_rd(vcpu);
+
+	if (is_write) {
+		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
+					       len);
+
+		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
+		kvm_mmio_write_buf(data_buf, len, data);
+
+		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+				       data_buf);
+	} else {
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
+			       fault_ipa, NULL);
+
+		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+				      data_buf);
+	}
+
+	/* Now prepare kvm_run for the potential return to userland. */
+	run->mmio.is_write	= is_write;
+	run->mmio.phys_addr	= fault_ipa;
+	run->mmio.len		= len;
+	vcpu->mmio_needed	= 1;
+
+	if (!ret) {
+		/* We handled the access successfully in the kernel. */
+		if (!is_write)
+			memcpy(run->mmio.data, data_buf, len);
+		vcpu->stat.mmio_exit_kernel++;
+		kvm_handle_mmio_return(vcpu, run);
+		return 1;
+	}
+
+	if (is_write)
+		memcpy(run->mmio.data, data_buf, len);
+	vcpu->stat.mmio_exit_user++;
+	run->exit_reason	= KVM_EXIT_MMIO;
+	return 0;
+}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
new file mode 100644
index 000000000000..e3b9ee268823
--- /dev/null
+++ b/arch/arm64/kvm/mmu.c
@@ -0,0 +1,2447 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/mman.h>
+#include <linux/kvm_host.h>
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/signal.h>
+#include <trace/events/kvm.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_ras.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/virt.h>
+
+#include "trace.h"
+
+static pgd_t *boot_hyp_pgd;
+static pgd_t *hyp_pgd;
+static pgd_t *merged_hyp_pgd;
+static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
+
+static unsigned long hyp_idmap_start;
+static unsigned long hyp_idmap_end;
+static phys_addr_t hyp_idmap_vector;
+
+static unsigned long io_map_base;
+
+#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
+
+#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
+
+static bool is_iomap(unsigned long flags)
+{
+	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+}
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
+}
+
+/**
+ * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * @kvm:	pointer to kvm structure.
+ *
+ * Interface to HYP function to flush all VM TLB entries
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+}
+
+static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+}
+
+/*
+ * D-Cache management functions. They take the page table entries by
+ * value, as they are flushing the cache using the kernel mapping (or
+ * kmap on 32bit).
+ */
+static void kvm_flush_dcache_pte(pte_t pte)
+{
+	__kvm_flush_dcache_pte(pte);
+}
+
+static void kvm_flush_dcache_pmd(pmd_t pmd)
+{
+	__kvm_flush_dcache_pmd(pmd);
+}
+
+static void kvm_flush_dcache_pud(pud_t pud)
+{
+	__kvm_flush_dcache_pud(pud);
+}
+
+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pmd:	pmd pointer for IPA
+ *
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+	if (!pmd_thp_or_huge(*pmd))
+		return;
+
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pmd));
+}
+
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm:	pointer to kvm structure.
+ * @addr:	IPA
+ * @pud:	pud pointer for IPA
+ *
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+	if (!stage2_pud_huge(kvm, *pudp))
+		return;
+
+	stage2_pud_clear(kvm, pudp);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	put_page(virt_to_page(pudp));
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  int min, int max)
+{
+	void *page;
+
+	BUG_ON(max > KVM_NR_MEM_OBJS);
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < max) {
+		page = (void *)__get_free_page(GFP_PGTABLE_USER);
+		if (!page)
+			return -ENOMEM;
+		cache->objects[cache->nobjs++] = page;
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+	void *p;
+
+	BUG_ON(!mc || !mc->nobjs);
+	p = mc->objects[--mc->nobjs];
+	return p;
+}
+
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+{
+	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
+	stage2_pgd_clear(kvm, pgd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	stage2_pud_free(kvm, pud_table);
+	put_page(virt_to_page(pgd));
+}
+
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+{
+	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
+	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
+	stage2_pud_clear(kvm, pud);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	stage2_pmd_free(kvm, pmd_table);
+	put_page(virt_to_page(pud));
+}
+
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
+	pmd_clear(pmd);
+	kvm_tlb_flush_vmid_ipa(kvm, addr);
+	free_page((unsigned long)pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
+{
+	WRITE_ONCE(*ptep, new_pte);
+	dsb(ishst);
+}
+
+static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
+{
+	WRITE_ONCE(*pmdp, new_pmd);
+	dsb(ishst);
+}
+
+static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
+{
+	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
+}
+
+static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
+{
+	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
+	dsb(ishst);
+}
+
+static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
+{
+	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
+	dsb(ishst);
+}
+
+/*
+ * Unmapping vs dcache management:
+ *
+ * If a guest maps certain memory pages as uncached, all writes will
+ * bypass the data cache and go directly to RAM.  However, the CPUs
+ * can still speculate reads (not writes) and fill cache lines with
+ * data.
+ *
+ * Those cache lines will be *clean* cache lines though, so a
+ * clean+invalidate operation is equivalent to an invalidate
+ * operation, because no cache lines are marked dirty.
+ *
+ * Those clean cache lines could be filled prior to an uncached write
+ * by the guest, and the cache coherent IO subsystem would therefore
+ * end up writing old data to disk.
+ *
+ * This is why right after unmapping a page/section and invalidating
+ * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
+ * the IO subsystem will never hit in the cache.
+ *
+ * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
+ * we then fully enforce cacheability of RAM, no matter what the guest
+ * does.
+ */
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
+		       phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t start_addr = addr;
+	pte_t *pte, *start_pte;
+
+	start_pte = pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			pte_t old_pte = *pte;
+
+			kvm_set_pte(pte, __pte(0));
+			kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+			/* No need to invalidate the cache for device mappings */
+			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
+				kvm_flush_dcache_pte(old_pte);
+
+			put_page(virt_to_page(pte));
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	if (stage2_pte_table_empty(kvm, start_pte))
+		clear_stage2_pmd_entry(kvm, pmd, start_addr);
+}
+
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
+		       phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next, start_addr = addr;
+	pmd_t *pmd, *start_pmd;
+
+	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
+	do {
+		next = stage2_pmd_addr_end(kvm, addr, end);
+		if (!pmd_none(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
+				pmd_t old_pmd = *pmd;
+
+				pmd_clear(pmd);
+				kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+				kvm_flush_dcache_pmd(old_pmd);
+
+				put_page(virt_to_page(pmd));
+			} else {
+				unmap_stage2_ptes(kvm, pmd, addr, next);
+			}
+		}
+	} while (pmd++, addr = next, addr != end);
+
+	if (stage2_pmd_table_empty(kvm, start_pmd))
+		clear_stage2_pud_entry(kvm, pud, start_addr);
+}
+
+static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
+		       phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next, start_addr = addr;
+	pud_t *pud, *start_pud;
+
+	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
+	do {
+		next = stage2_pud_addr_end(kvm, addr, end);
+		if (!stage2_pud_none(kvm, *pud)) {
+			if (stage2_pud_huge(kvm, *pud)) {
+				pud_t old_pud = *pud;
+
+				stage2_pud_clear(kvm, pud);
+				kvm_tlb_flush_vmid_ipa(kvm, addr);
+				kvm_flush_dcache_pud(old_pud);
+				put_page(virt_to_page(pud));
+			} else {
+				unmap_stage2_pmds(kvm, pud, addr, next);
+			}
+		}
+	} while (pud++, addr = next, addr != end);
+
+	if (stage2_pud_table_empty(kvm, start_pud))
+		clear_stage2_pgd_entry(kvm, pgd, start_addr);
+}
+
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm:   The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size:  The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+{
+	pgd_t *pgd;
+	phys_addr_t addr = start, end = start + size;
+	phys_addr_t next;
+
+	assert_spin_locked(&kvm->mmu_lock);
+	WARN_ON(size & ~PAGE_MASK);
+
+	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+	do {
+		/*
+		 * Make sure the page table is still active, as another thread
+		 * could have possibly freed the page table, while we released
+		 * the lock.
+		 */
+		if (!READ_ONCE(kvm->arch.pgd))
+			break;
+		next = stage2_pgd_addr_end(kvm, addr, end);
+		if (!stage2_pgd_none(kvm, *pgd))
+			unmap_stage2_puds(kvm, pgd, addr, next);
+		/*
+		 * If the range is too large, release the kvm->mmu_lock
+		 * to prevent starvation and lockup detector warnings.
+		 */
+		if (next != end)
+			cond_resched_lock(&kvm->mmu_lock);
+	} while (pgd++, addr = next, addr != end);
+}
+
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+			      phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
+			kvm_flush_dcache_pte(*pte);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+			      phys_addr_t addr, phys_addr_t end)
+{
+	pmd_t *pmd;
+	phys_addr_t next;
+
+	pmd = stage2_pmd_offset(kvm, pud, addr);
+	do {
+		next = stage2_pmd_addr_end(kvm, addr, end);
+		if (!pmd_none(*pmd)) {
+			if (pmd_thp_or_huge(*pmd))
+				kvm_flush_dcache_pmd(*pmd);
+			else
+				stage2_flush_ptes(kvm, pmd, addr, next);
+		}
+	} while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
+			      phys_addr_t addr, phys_addr_t end)
+{
+	pud_t *pud;
+	phys_addr_t next;
+
+	pud = stage2_pud_offset(kvm, pgd, addr);
+	do {
+		next = stage2_pud_addr_end(kvm, addr, end);
+		if (!stage2_pud_none(kvm, *pud)) {
+			if (stage2_pud_huge(kvm, *pud))
+				kvm_flush_dcache_pud(*pud);
+			else
+				stage2_flush_pmds(kvm, pud, addr, next);
+		}
+	} while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *memslot)
+{
+	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+	phys_addr_t next;
+	pgd_t *pgd;
+
+	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+	do {
+		next = stage2_pgd_addr_end(kvm, addr, end);
+		if (!stage2_pgd_none(kvm, *pgd))
+			stage2_flush_puds(kvm, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+static void stage2_flush_vm(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots)
+		stage2_flush_memslot(kvm, memslot);
+
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+	pgd_clear(pgd);
+	pud_free(NULL, pud_table);
+	put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+	VM_BUG_ON(pud_huge(*pud));
+	pud_clear(pud);
+	pmd_free(NULL, pmd_table);
+	put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
+	pmd_clear(pmd);
+	pte_free_kernel(NULL, pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte, *start_pte;
+
+	start_pte = pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			kvm_set_pte(pte, __pte(0));
+			put_page(virt_to_page(pte));
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	if (hyp_pte_table_empty(start_pte))
+		clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pmd_t *pmd, *start_pmd;
+
+	start_pmd = pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* Hyp doesn't use huge pmds */
+		if (!pmd_none(*pmd))
+			unmap_hyp_ptes(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+
+	if (hyp_pmd_table_empty(start_pmd))
+		clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pud_t *pud, *start_pud;
+
+	start_pud = pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* Hyp doesn't use huge puds */
+		if (!pud_none(*pud))
+			unmap_hyp_pmds(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	if (hyp_pud_table_empty(start_pud))
+		clear_hyp_pgd_entry(pgd);
+}
+
+static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
+{
+	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
+}
+
+static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+			      phys_addr_t start, u64 size)
+{
+	pgd_t *pgd;
+	phys_addr_t addr = start, end = start + size;
+	phys_addr_t next;
+
+	/*
+	 * We don't unmap anything from HYP, except at the hyp tear down.
+	 * Hence, we don't have to invalidate the TLBs here.
+	 */
+	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none(*pgd))
+			unmap_hyp_puds(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
+}
+
+static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
+}
+
+/**
+ * free_hyp_pgds - free Hyp-mode page tables
+ *
+ * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
+ * therefore contains either mappings in the kernel memory area (above
+ * PAGE_OFFSET), or device mappings in the idmap range.
+ *
+ * boot_hyp_pgd should only map the idmap range, and is only used in
+ * the extended idmap case.
+ */
+void free_hyp_pgds(void)
+{
+	pgd_t *id_pgd;
+
+	mutex_lock(&kvm_hyp_pgd_mutex);
+
+	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
+
+	if (id_pgd) {
+		/* In case we never called hyp_mmu_init() */
+		if (!io_map_base)
+			io_map_base = hyp_idmap_start;
+		unmap_hyp_idmap_range(id_pgd, io_map_base,
+				      hyp_idmap_start + PAGE_SIZE - io_map_base);
+	}
+
+	if (boot_hyp_pgd) {
+		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+		boot_hyp_pgd = NULL;
+	}
+
+	if (hyp_pgd) {
+		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
+				(uintptr_t)high_memory - PAGE_OFFSET);
+
+		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
+		hyp_pgd = NULL;
+	}
+	if (merged_hyp_pgd) {
+		clear_page(merged_hyp_pgd);
+		free_page((unsigned long)merged_hyp_pgd);
+		merged_hyp_pgd = NULL;
+	}
+
+	mutex_unlock(&kvm_hyp_pgd_mutex);
+}
+
+static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
+				    unsigned long end, unsigned long pfn,
+				    pgprot_t prot)
+{
+	pte_t *pte;
+	unsigned long addr;
+
+	addr = start;
+	do {
+		pte = pte_offset_kernel(pmd, addr);
+		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
+		get_page(virt_to_page(pte));
+		pfn++;
+	} while (addr += PAGE_SIZE, addr != end);
+}
+
+static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
+				   unsigned long end, unsigned long pfn,
+				   pgprot_t prot)
+{
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long addr, next;
+
+	addr = start;
+	do {
+		pmd = pmd_offset(pud, addr);
+
+		BUG_ON(pmd_sect(*pmd));
+
+		if (pmd_none(*pmd)) {
+			pte = pte_alloc_one_kernel(NULL);
+			if (!pte) {
+				kvm_err("Cannot allocate Hyp pte\n");
+				return -ENOMEM;
+			}
+			kvm_pmd_populate(pmd, pte);
+			get_page(virt_to_page(pmd));
+		}
+
+		next = pmd_addr_end(addr, end);
+
+		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
+		pfn += (next - addr) >> PAGE_SHIFT;
+	} while (addr = next, addr != end);
+
+	return 0;
+}
+
+static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
+				   unsigned long end, unsigned long pfn,
+				   pgprot_t prot)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long addr, next;
+	int ret;
+
+	addr = start;
+	do {
+		pud = pud_offset(pgd, addr);
+
+		if (pud_none_or_clear_bad(pud)) {
+			pmd = pmd_alloc_one(NULL, addr);
+			if (!pmd) {
+				kvm_err("Cannot allocate Hyp pmd\n");
+				return -ENOMEM;
+			}
+			kvm_pud_populate(pud, pmd);
+			get_page(virt_to_page(pud));
+		}
+
+		next = pud_addr_end(addr, end);
+		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
+		if (ret)
+			return ret;
+		pfn += (next - addr) >> PAGE_SHIFT;
+	} while (addr = next, addr != end);
+
+	return 0;
+}
+
+static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+				 unsigned long start, unsigned long end,
+				 unsigned long pfn, pgprot_t prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	unsigned long addr, next;
+	int err = 0;
+
+	mutex_lock(&kvm_hyp_pgd_mutex);
+	addr = start & PAGE_MASK;
+	end = PAGE_ALIGN(end);
+	do {
+		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+
+		if (pgd_none(*pgd)) {
+			pud = pud_alloc_one(NULL, addr);
+			if (!pud) {
+				kvm_err("Cannot allocate Hyp pud\n");
+				err = -ENOMEM;
+				goto out;
+			}
+			kvm_pgd_populate(pgd, pud);
+			get_page(virt_to_page(pgd));
+		}
+
+		next = pgd_addr_end(addr, end);
+		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
+		if (err)
+			goto out;
+		pfn += (next - addr) >> PAGE_SHIFT;
+	} while (addr = next, addr != end);
+out:
+	mutex_unlock(&kvm_hyp_pgd_mutex);
+	return err;
+}
+
+static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
+{
+	if (!is_vmalloc_addr(kaddr)) {
+		BUG_ON(!virt_addr_valid(kaddr));
+		return __pa(kaddr);
+	} else {
+		return page_to_phys(vmalloc_to_page(kaddr)) +
+		       offset_in_page(kaddr);
+	}
+}
+
+/**
+ * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
+ * @from:	The virtual kernel start address of the range
+ * @to:		The virtual kernel end address of the range (exclusive)
+ * @prot:	The protection to be applied to this range
+ *
+ * The same virtual address as the kernel virtual address is also used
+ * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
+ * physical pages.
+ */
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+{
+	phys_addr_t phys_addr;
+	unsigned long virt_addr;
+	unsigned long start = kern_hyp_va((unsigned long)from);
+	unsigned long end = kern_hyp_va((unsigned long)to);
+
+	if (is_kernel_in_hyp_mode())
+		return 0;
+
+	start = start & PAGE_MASK;
+	end = PAGE_ALIGN(end);
+
+	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+		int err;
+
+		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
+		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
+					    virt_addr, virt_addr + PAGE_SIZE,
+					    __phys_to_pfn(phys_addr),
+					    prot);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+					unsigned long *haddr, pgprot_t prot)
+{
+	pgd_t *pgd = hyp_pgd;
+	unsigned long base;
+	int ret = 0;
+
+	mutex_lock(&kvm_hyp_pgd_mutex);
+
+	/*
+	 * This assumes that we we have enough space below the idmap
+	 * page to allocate our VAs. If not, the check below will
+	 * kick. A potential alternative would be to detect that
+	 * overflow and switch to an allocation above the idmap.
+	 *
+	 * The allocated size is always a multiple of PAGE_SIZE.
+	 */
+	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
+	base = io_map_base - size;
+
+	/*
+	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+	 * allocating the new area, as it would indicate we've
+	 * overflowed the idmap/IO address range.
+	 */
+	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+		ret = -ENOMEM;
+	else
+		io_map_base = base;
+
+	mutex_unlock(&kvm_hyp_pgd_mutex);
+
+	if (ret)
+		goto out;
+
+	if (__kvm_cpu_uses_extended_idmap())
+		pgd = boot_hyp_pgd;
+
+	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+				    base, base + size,
+				    __phys_to_pfn(phys_addr), prot);
+	if (ret)
+		goto out;
+
+	*haddr = base + offset_in_page(phys_addr);
+
+out:
+	return ret;
+}
+
+/**
+ * create_hyp_io_mappings - Map IO into both kernel and HYP
+ * @phys_addr:	The physical start address which gets mapped
+ * @size:	Size of the region being mapped
+ * @kaddr:	Kernel VA for this mapping
+ * @haddr:	HYP VA for this mapping
+ */
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+			   void __iomem **kaddr,
+			   void __iomem **haddr)
+{
+	unsigned long addr;
+	int ret;
+
+	*kaddr = ioremap(phys_addr, size);
+	if (!*kaddr)
+		return -ENOMEM;
+
+	if (is_kernel_in_hyp_mode()) {
+		*haddr = *kaddr;
+		return 0;
+	}
+
+	ret = __create_hyp_private_mapping(phys_addr, size,
+					   &addr, PAGE_HYP_DEVICE);
+	if (ret) {
+		iounmap(*kaddr);
+		*kaddr = NULL;
+		*haddr = NULL;
+		return ret;
+	}
+
+	*haddr = (void __iomem *)addr;
+	return 0;
+}
+
+/**
+ * create_hyp_exec_mappings - Map an executable range into HYP
+ * @phys_addr:	The physical start address which gets mapped
+ * @size:	Size of the region being mapped
+ * @haddr:	HYP VA for this mapping
+ */
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+			     void **haddr)
+{
+	unsigned long addr;
+	int ret;
+
+	BUG_ON(is_kernel_in_hyp_mode());
+
+	ret = __create_hyp_private_mapping(phys_addr, size,
+					   &addr, PAGE_HYP_EXEC);
+	if (ret) {
+		*haddr = NULL;
+		return ret;
+	}
+
+	*haddr = (void *)addr;
+	return 0;
+}
+
+/**
+ * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
+ * @kvm:	The KVM struct pointer for the VM.
+ *
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
+ */
+int kvm_alloc_stage2_pgd(struct kvm *kvm)
+{
+	phys_addr_t pgd_phys;
+	pgd_t *pgd;
+
+	if (kvm->arch.pgd != NULL) {
+		kvm_err("kvm_arch already initialized?\n");
+		return -EINVAL;
+	}
+
+	/* Allocate the HW PGD, making sure that each page gets its own refcount */
+	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
+	if (!pgd)
+		return -ENOMEM;
+
+	pgd_phys = virt_to_phys(pgd);
+	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
+		return -EINVAL;
+
+	kvm->arch.pgd = pgd;
+	kvm->arch.pgd_phys = pgd_phys;
+	return 0;
+}
+
+static void stage2_unmap_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *memslot)
+{
+	hva_t hva = memslot->userspace_addr;
+	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+	phys_addr_t size = PAGE_SIZE * memslot->npages;
+	hva_t reg_end = hva + size;
+
+	/*
+	 * A memory region could potentially cover multiple VMAs, and any holes
+	 * between them, so iterate over all of them to find out if we should
+	 * unmap any of them.
+	 *
+	 *     +--------------------------------------------+
+	 * +---------------+----------------+   +----------------+
+	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+	 * +---------------+----------------+   +----------------+
+	 *     |               memory region                |
+	 *     +--------------------------------------------+
+	 */
+	do {
+		struct vm_area_struct *vma = find_vma(current->mm, hva);
+		hva_t vm_start, vm_end;
+
+		if (!vma || vma->vm_start >= reg_end)
+			break;
+
+		/*
+		 * Take the intersection of this VMA with the memory region
+		 */
+		vm_start = max(hva, vma->vm_start);
+		vm_end = min(reg_end, vma->vm_end);
+
+		if (!(vma->vm_flags & VM_PFNMAP)) {
+			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+		}
+		hva = vm_end;
+	} while (hva < reg_end);
+}
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any reguler RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	down_read(&current->mm->mmap_sem);
+	spin_lock(&kvm->mmu_lock);
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots)
+		stage2_unmap_memslot(kvm, memslot);
+
+	spin_unlock(&kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
+/**
+ * kvm_free_stage2_pgd - free all stage-2 tables
+ * @kvm:	The KVM struct pointer for the VM.
+ *
+ * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
+ * underlying level-2 and level-3 tables before freeing the actual level-1 table
+ * and setting the struct pointer to NULL.
+ */
+void kvm_free_stage2_pgd(struct kvm *kvm)
+{
+	void *pgd = NULL;
+
+	spin_lock(&kvm->mmu_lock);
+	if (kvm->arch.pgd) {
+		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
+		pgd = READ_ONCE(kvm->arch.pgd);
+		kvm->arch.pgd = NULL;
+		kvm->arch.pgd_phys = 0;
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	/* Free the HW pgd, one page at a time */
+	if (pgd)
+		free_pages_exact(pgd, stage2_pgd_size(kvm));
+}
+
+static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+	if (stage2_pgd_none(kvm, *pgd)) {
+		if (!cache)
+			return NULL;
+		pud = mmu_memory_cache_alloc(cache);
+		stage2_pgd_populate(kvm, pgd, pud);
+		get_page(virt_to_page(pgd));
+	}
+
+	return stage2_pud_offset(kvm, pgd, addr);
+}
+
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			     phys_addr_t addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = stage2_get_pud(kvm, cache, addr);
+	if (!pud || stage2_pud_huge(kvm, *pud))
+		return NULL;
+
+	if (stage2_pud_none(kvm, *pud)) {
+		if (!cache)
+			return NULL;
+		pmd = mmu_memory_cache_alloc(cache);
+		stage2_pud_populate(kvm, pud, pmd);
+		get_page(virt_to_page(pud));
+	}
+
+	return stage2_pmd_offset(kvm, pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+	pmd_t *pmd, old_pmd;
+
+retry:
+	pmd = stage2_get_pmd(kvm, cache, addr);
+	VM_BUG_ON(!pmd);
+
+	old_pmd = *pmd;
+	/*
+	 * Multiple vcpus faulting on the same PMD entry, can
+	 * lead to them sequentially updating the PMD with the
+	 * same value. Following the break-before-make
+	 * (pmd_clear() followed by tlb_flush()) process can
+	 * hinder forward progress due to refaults generated
+	 * on missing translations.
+	 *
+	 * Skip updating the page table if the entry is
+	 * unchanged.
+	 */
+	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+		return 0;
+
+	if (pmd_present(old_pmd)) {
+		/*
+		 * If we already have PTE level mapping for this block,
+		 * we must unmap it to avoid inconsistent TLB state and
+		 * leaking the table page. We could end up in this situation
+		 * if the memory slot was marked for dirty logging and was
+		 * reverted, leaving PTE level mappings for the pages accessed
+		 * during the period. So, unmap the PTE level mapping for this
+		 * block and retry, as we could have released the upper level
+		 * table in the process.
+		 *
+		 * Normal THP split/merge follows mmu_notifier callbacks and do
+		 * get handled accordingly.
+		 */
+		if (!pmd_thp_or_huge(old_pmd)) {
+			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+			goto retry;
+		}
+		/*
+		 * Mapping in huge pages should only happen through a
+		 * fault.  If a page is merged into a transparent huge
+		 * page, the individual subpages of that huge page
+		 * should be unmapped through MMU notifiers before we
+		 * get here.
+		 *
+		 * Merging of CompoundPages is not supported; they
+		 * should become splitting first, unmapped, merged,
+		 * and mapped back in on-demand.
+		 */
+		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
+		pmd_clear(pmd);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		get_page(virt_to_page(pmd));
+	}
+
+	kvm_set_pmd(pmd, *new_pmd);
+	return 0;
+}
+
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			       phys_addr_t addr, const pud_t *new_pudp)
+{
+	pud_t *pudp, old_pud;
+
+retry:
+	pudp = stage2_get_pud(kvm, cache, addr);
+	VM_BUG_ON(!pudp);
+
+	old_pud = *pudp;
+
+	/*
+	 * A large number of vcpus faulting on the same stage 2 entry,
+	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+	 * Skip updating the page tables if there is no change.
+	 */
+	if (pud_val(old_pud) == pud_val(*new_pudp))
+		return 0;
+
+	if (stage2_pud_present(kvm, old_pud)) {
+		/*
+		 * If we already have table level mapping for this block, unmap
+		 * the range for this block and retry.
+		 */
+		if (!stage2_pud_huge(kvm, old_pud)) {
+			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+			goto retry;
+		}
+
+		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
+		stage2_pud_clear(kvm, pudp);
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		get_page(virt_to_page(pudp));
+	}
+
+	kvm_set_pud(pudp, *new_pudp);
+	return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	*pudpp = NULL;
+	*pmdpp = NULL;
+	*ptepp = NULL;
+
+	pudp = stage2_get_pud(kvm, NULL, addr);
+	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+		return false;
+
+	if (stage2_pud_huge(kvm, *pudp)) {
+		*pudpp = pudp;
+		return true;
+	}
+
+	pmdp = stage2_pmd_offset(kvm, pudp, addr);
+	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
+		return false;
+
+	if (pmd_thp_or_huge(*pmdp)) {
+		*pmdpp = pmdp;
+		return true;
+	}
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
+		return false;
+
+	*ptepp = ptep;
+	return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+	bool found;
+
+	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+	if (!found)
+		return false;
+
+	if (pudp)
+		return kvm_s2pud_exec(pudp);
+	else if (pmdp)
+		return kvm_s2pmd_exec(pmdp);
+	else
+		return kvm_s2pte_exec(ptep);
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			  phys_addr_t addr, const pte_t *new_pte,
+			  unsigned long flags)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+	VM_BUG_ON(logging_active && !cache);
+
+	/* Create stage-2 page table mapping - Levels 0 and 1 */
+	pud = stage2_get_pud(kvm, cache, addr);
+	if (!pud) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/*
+	 * While dirty page logging - dissolve huge PUD, then continue
+	 * on to allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pud(kvm, addr, pud);
+
+	if (stage2_pud_none(kvm, *pud)) {
+		if (!cache)
+			return 0; /* ignore calls from kvm_set_spte_hva */
+		pmd = mmu_memory_cache_alloc(cache);
+		stage2_pud_populate(kvm, pud, pmd);
+		get_page(virt_to_page(pud));
+	}
+
+	pmd = stage2_pmd_offset(kvm, pud, addr);
+	if (!pmd) {
+		/*
+		 * Ignore calls from kvm_set_spte_hva for unallocated
+		 * address ranges.
+		 */
+		return 0;
+	}
+
+	/*
+	 * While dirty page logging - dissolve huge PMD, then continue on to
+	 * allocate page.
+	 */
+	if (logging_active)
+		stage2_dissolve_pmd(kvm, addr, pmd);
+
+	/* Create stage-2 page mappings - Level 2 */
+	if (pmd_none(*pmd)) {
+		if (!cache)
+			return 0; /* ignore calls from kvm_set_spte_hva */
+		pte = mmu_memory_cache_alloc(cache);
+		kvm_pmd_populate(pmd, pte);
+		get_page(virt_to_page(pmd));
+	}
+
+	pte = pte_offset_kernel(pmd, addr);
+
+	if (iomap && pte_present(*pte))
+		return -EFAULT;
+
+	/* Create 2nd stage page table mapping - Level 3 */
+	old_pte = *pte;
+	if (pte_present(old_pte)) {
+		/* Skip page table update if there is no change */
+		if (pte_val(old_pte) == pte_val(*new_pte))
+			return 0;
+
+		kvm_set_pte(pte, __pte(0));
+		kvm_tlb_flush_vmid_ipa(kvm, addr);
+	} else {
+		get_page(virt_to_page(pte));
+	}
+
+	kvm_set_pte(pte, *new_pte);
+	return 0;
+}
+
+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	if (pte_young(*pte)) {
+		*pte = pte_mkold(*pte);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+	return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
+/**
+ * kvm_phys_addr_ioremap - map a device range to guest IPA
+ *
+ * @kvm:	The KVM pointer
+ * @guest_ipa:	The IPA at which to insert the mapping
+ * @pa:		The physical address of the device
+ * @size:	The size of the mapping
+ */
+int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
+			  phys_addr_t pa, unsigned long size, bool writable)
+{
+	phys_addr_t addr, end;
+	int ret = 0;
+	unsigned long pfn;
+	struct kvm_mmu_memory_cache cache = { 0, };
+
+	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
+	pfn = __phys_to_pfn(pa);
+
+	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
+		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
+
+		if (writable)
+			pte = kvm_s2pte_mkwrite(pte);
+
+		ret = mmu_topup_memory_cache(&cache,
+					     kvm_mmu_cache_min_pages(kvm),
+					     KVM_NR_MEM_OBJS);
+		if (ret)
+			goto out;
+		spin_lock(&kvm->mmu_lock);
+		ret = stage2_set_pte(kvm, &cache, addr, &pte,
+						KVM_S2PTE_FLAG_IS_IOMAP);
+		spin_unlock(&kvm->mmu_lock);
+		if (ret)
+			goto out;
+
+		pfn++;
+	}
+
+out:
+	mmu_free_memory_cache(&cache);
+	return ret;
+}
+
+static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
+{
+	kvm_pfn_t pfn = *pfnp;
+	gfn_t gfn = *ipap >> PAGE_SHIFT;
+
+	if (kvm_is_transparent_hugepage(pfn)) {
+		unsigned long mask;
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page.  However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page.  We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		mask = PTRS_PER_PMD - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			*ipap &= PMD_MASK;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			kvm_get_pfn(pfn);
+			*pfnp = pfn;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * stage2_wp_ptes - write protect PMD range
+ * @pmd:	pointer to pmd entry
+ * @addr:	range start address
+ * @end:	range end address
+ */
+static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			if (!kvm_s2pte_readonly(pte))
+				kvm_set_s2pte_readonly(pte);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/**
+ * stage2_wp_pmds - write protect PUD range
+ * kvm:		kvm instance for the VM
+ * @pud:	pointer to pud entry
+ * @addr:	range start address
+ * @end:	range end address
+ */
+static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
+			   phys_addr_t addr, phys_addr_t end)
+{
+	pmd_t *pmd;
+	phys_addr_t next;
+
+	pmd = stage2_pmd_offset(kvm, pud, addr);
+
+	do {
+		next = stage2_pmd_addr_end(kvm, addr, end);
+		if (!pmd_none(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
+				if (!kvm_s2pmd_readonly(pmd))
+					kvm_set_s2pmd_readonly(pmd);
+			} else {
+				stage2_wp_ptes(pmd, addr, next);
+			}
+		}
+	} while (pmd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_puds - write protect PGD range
+ * @pgd:	pointer to pgd entry
+ * @addr:	range start address
+ * @end:	range end address
+ */
+static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
+			    phys_addr_t addr, phys_addr_t end)
+{
+	pud_t *pud;
+	phys_addr_t next;
+
+	pud = stage2_pud_offset(kvm, pgd, addr);
+	do {
+		next = stage2_pud_addr_end(kvm, addr, end);
+		if (!stage2_pud_none(kvm, *pud)) {
+			if (stage2_pud_huge(kvm, *pud)) {
+				if (!kvm_s2pud_readonly(pud))
+					kvm_set_s2pud_readonly(pud);
+			} else {
+				stage2_wp_pmds(kvm, pud, addr, next);
+			}
+		}
+	} while (pud++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_range() - write protect stage2 memory region range
+ * @kvm:	The KVM pointer
+ * @addr:	Start address of range
+ * @end:	End address of range
+ */
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+	pgd_t *pgd;
+	phys_addr_t next;
+
+	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+	do {
+		/*
+		 * Release kvm_mmu_lock periodically if the memory region is
+		 * large. Otherwise, we may see kernel panics with
+		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
+		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
+		 * will also starve other vCPUs. We have to also make sure
+		 * that the page tables are not freed while we released
+		 * the lock.
+		 */
+		cond_resched_lock(&kvm->mmu_lock);
+		if (!READ_ONCE(kvm->arch.pgd))
+			break;
+		next = stage2_pgd_addr_end(kvm, addr, end);
+		if (stage2_pgd_present(kvm, *pgd))
+			stage2_wp_puds(kvm, pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot to write protect
+ *
+ * Called to start logging dirty pages after memory region
+ * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
+ * all present PUD, PMD and PTEs are write protected in the memory region.
+ * Afterwards read of dirty page log can be called.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+	phys_addr_t start, end;
+
+	if (WARN_ON_ONCE(!memslot))
+		return;
+
+	start = memslot->base_gfn << PAGE_SHIFT;
+	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	stage2_wp_range(kvm, start, end);
+	spin_unlock(&kvm->mmu_lock);
+	kvm_flush_remote_tlbs(kvm);
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * @kvm:	The KVM pointer
+ * @slot:	The memory slot associated with mask
+ * @gfn_offset:	The gfn offset in memory slot
+ * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
+ *		slot to be write protected
+ *
+ * Walks bits set in mask write protects the associated pte's. Caller must
+ * acquire kvm_mmu_lock.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
+	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+	stage2_wp_range(kvm, start, end);
+}
+
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+		struct kvm_memory_slot *slot,
+		gfn_t gfn_offset, unsigned long mask)
+{
+	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+	__clean_dcache_guest_page(pfn, size);
+}
+
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+	__invalidate_icache_guest_page(pfn, size);
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
+{
+	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
+}
+
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+					       unsigned long hva,
+					       unsigned long map_size)
+{
+	gpa_t gpa_start;
+	hva_t uaddr_start, uaddr_end;
+	size_t size;
+
+	size = memslot->npages * PAGE_SIZE;
+
+	gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+	uaddr_start = memslot->userspace_addr;
+	uaddr_end = uaddr_start + size;
+
+	/*
+	 * Pages belonging to memslots that don't have the same alignment
+	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
+	 *
+	 * Consider a layout like the following:
+	 *
+	 *    memslot->userspace_addr:
+	 *    +-----+--------------------+--------------------+---+
+	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
+	 *    +-----+--------------------+--------------------+---+
+	 *
+	 *    memslot->base_gfn << PAGE_SIZE:
+	 *      +---+--------------------+--------------------+-----+
+	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
+	 *      +---+--------------------+--------------------+-----+
+	 *
+	 * If we create those stage-2 blocks, we'll end up with this incorrect
+	 * mapping:
+	 *   d -> f
+	 *   e -> g
+	 *   f -> h
+	 */
+	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+		return false;
+
+	/*
+	 * Next, let's make sure we're not trying to map anything not covered
+	 * by the memslot. This means we have to prohibit block size mappings
+	 * for the beginning and end of a non-block aligned and non-block sized
+	 * memory slot (illustrated by the head and tail parts of the
+	 * userspace view above containing pages 'abcde' and 'xyz',
+	 * respectively).
+	 *
+	 * Note that it doesn't matter if we do the check using the
+	 * userspace_addr or the base_gfn, as both are equally aligned (per
+	 * the check above) and equally sized.
+	 */
+	return (hva & ~(map_size - 1)) >= uaddr_start &&
+	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+}
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+			  struct kvm_memory_slot *memslot, unsigned long hva,
+			  unsigned long fault_status)
+{
+	int ret;
+	bool write_fault, writable, force_pte = false;
+	bool exec_fault, needs_exec;
+	unsigned long mmu_seq;
+	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	struct vm_area_struct *vma;
+	short vma_shift;
+	kvm_pfn_t pfn;
+	pgprot_t mem_type = PAGE_S2;
+	bool logging_active = memslot_is_logging(memslot);
+	unsigned long vma_pagesize, flags = 0;
+
+	write_fault = kvm_is_write_fault(vcpu);
+	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+	VM_BUG_ON(write_fault && exec_fault);
+
+	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
+		kvm_err("Unexpected L2 read permission error\n");
+		return -EFAULT;
+	}
+
+	/* Let's check if we will get back a huge page backed by hugetlbfs */
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma_intersection(current->mm, hva, hva + 1);
+	if (unlikely(!vma)) {
+		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+		up_read(&current->mm->mmap_sem);
+		return -EFAULT;
+	}
+
+	if (is_vm_hugetlb_page(vma))
+		vma_shift = huge_page_shift(hstate_vma(vma));
+	else
+		vma_shift = PAGE_SHIFT;
+
+	vma_pagesize = 1ULL << vma_shift;
+	if (logging_active ||
+	    (vma->vm_flags & VM_PFNMAP) ||
+	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+		force_pte = true;
+		vma_pagesize = PAGE_SIZE;
+	}
+
+	/*
+	 * The stage2 has a minimum of 2 level table (For arm64 see
+	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+	 * As for PUD huge maps, we must make sure that we have at least
+	 * 3 levels, i.e, PMD is not folded.
+	 */
+	if (vma_pagesize == PMD_SIZE ||
+	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
+		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+	up_read(&current->mm->mmap_sem);
+
+	/* We need minimum second+third level pages */
+	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
+				     KVM_NR_MEM_OBJS);
+	if (ret)
+		return ret;
+
+	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	/*
+	 * Ensure the read of mmu_notifier_seq happens before we call
+	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
+	 * the page we just got a reference to gets unmapped before we have a
+	 * chance to grab the mmu_lock, which ensure that if the page gets
+	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
+	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
+	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
+	 */
+	smp_rmb();
+
+	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+	if (pfn == KVM_PFN_ERR_HWPOISON) {
+		kvm_send_hwpoison_signal(hva, vma_shift);
+		return 0;
+	}
+	if (is_error_noslot_pfn(pfn))
+		return -EFAULT;
+
+	if (kvm_is_device_pfn(pfn)) {
+		mem_type = PAGE_S2_DEVICE;
+		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+	} else if (logging_active) {
+		/*
+		 * Faults on pages in a memslot with logging enabled
+		 * should not be mapped with huge pages (it introduces churn
+		 * and performance degradation), so force a pte mapping.
+		 */
+		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+		/*
+		 * Only actually map the page as writable if this was a write
+		 * fault.
+		 */
+		if (!write_fault)
+			writable = false;
+	}
+
+	if (exec_fault && is_iomap(flags))
+		return -ENOEXEC;
+
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq))
+		goto out_unlock;
+
+	if (vma_pagesize == PAGE_SIZE && !force_pte) {
+		/*
+		 * Only PMD_SIZE transparent hugepages(THP) are
+		 * currently supported. This code will need to be
+		 * updated to support other THP sizes.
+		 *
+		 * Make sure the host VA and the guest IPA are sufficiently
+		 * aligned and that the block is contained within the memslot.
+		 */
+		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+		    transparent_hugepage_adjust(&pfn, &fault_ipa))
+			vma_pagesize = PMD_SIZE;
+	}
+
+	if (writable)
+		kvm_set_pfn_dirty(pfn);
+
+	if (fault_status != FSC_PERM && !is_iomap(flags))
+		clean_dcache_guest_page(pfn, vma_pagesize);
+
+	if (exec_fault)
+		invalidate_icache_guest_page(pfn, vma_pagesize);
+
+	/*
+	 * If we took an execution fault we have made the
+	 * icache/dcache coherent above and should now let the s2
+	 * mapping be executable.
+	 *
+	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+	 * execute permissions, and we preserve whatever we have.
+	 */
+	needs_exec = exec_fault ||
+		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+
+	if (vma_pagesize == PUD_SIZE) {
+		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
+
+		new_pud = kvm_pud_mkhuge(new_pud);
+		if (writable)
+			new_pud = kvm_s2pud_mkwrite(new_pud);
+
+		if (needs_exec)
+			new_pud = kvm_s2pud_mkexec(new_pud);
+
+		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+	} else if (vma_pagesize == PMD_SIZE) {
+		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+		new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+		if (writable)
+			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+
+		if (needs_exec)
+			new_pmd = kvm_s2pmd_mkexec(new_pmd);
+
+		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+	} else {
+		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
+
+		if (writable) {
+			new_pte = kvm_s2pte_mkwrite(new_pte);
+			mark_page_dirty(kvm, gfn);
+		}
+
+		if (needs_exec)
+			new_pte = kvm_s2pte_mkexec(new_pte);
+
+		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
+	}
+
+out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	kvm_set_pfn_accessed(pfn);
+	kvm_release_pfn_clean(pfn);
+	return ret;
+}
+
+/*
+ * Resolve the access fault by making the page young again.
+ * Note that because the faulting entry is guaranteed not to be
+ * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
+ */
+static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	kvm_pfn_t pfn;
+	bool pfn_valid = false;
+
+	trace_kvm_access_fault(fault_ipa);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
+		goto out;
+
+	if (pud) {		/* HugeTLB */
+		*pud = kvm_s2pud_mkyoung(*pud);
+		pfn = kvm_pud_pfn(*pud);
+		pfn_valid = true;
+	} else	if (pmd) {	/* THP, HugeTLB */
+		*pmd = pmd_mkyoung(*pmd);
+		pfn = pmd_pfn(*pmd);
+		pfn_valid = true;
+	} else {
+		*pte = pte_mkyoung(*pte);	/* Just a page... */
+		pfn = pte_pfn(*pte);
+		pfn_valid = true;
+	}
+
+out:
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (pfn_valid)
+		kvm_set_pfn_accessed(pfn);
+}
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu:	the VCPU pointer
+ * @run:	the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean that either the
+ * guest simply needs more memory and we must allocate an appropriate page or it
+ * can mean that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
+int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	unsigned long fault_status;
+	phys_addr_t fault_ipa;
+	struct kvm_memory_slot *memslot;
+	unsigned long hva;
+	bool is_iabt, write_fault, writable;
+	gfn_t gfn;
+	int ret, idx;
+
+	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+
+	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+
+	/* Synchronous External Abort? */
+	if (kvm_vcpu_dabt_isextabt(vcpu)) {
+		/*
+		 * For RAS the host kernel may handle this abort.
+		 * There is no need to pass the error into the guest.
+		 */
+		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+			return 1;
+
+		if (unlikely(!is_iabt)) {
+			kvm_inject_vabt(vcpu);
+			return 1;
+		}
+	}
+
+	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
+			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
+
+	/* Check the stage-2 fault is trans. fault or write fault */
+	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+	    fault_status != FSC_ACCESS) {
+		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
+			kvm_vcpu_trap_get_class(vcpu),
+			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
+			(unsigned long)kvm_vcpu_get_hsr(vcpu));
+		return -EFAULT;
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	gfn = fault_ipa >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+	write_fault = kvm_is_write_fault(vcpu);
+	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
+		if (is_iabt) {
+			/* Prefetch Abort on I/O address */
+			ret = -ENOEXEC;
+			goto out;
+		}
+
+		/*
+		 * Check for a cache maintenance operation. Since we
+		 * ended-up here, we know it is outside of any memory
+		 * slot. But we can't find out if that is for a device,
+		 * or if the guest is just being stupid. The only thing
+		 * we know for sure is that this range cannot be cached.
+		 *
+		 * So let's assume that the guest is just being
+		 * cautious, and skip the instruction.
+		 */
+		if (kvm_vcpu_dabt_is_cm(vcpu)) {
+			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+			ret = 1;
+			goto out_unlock;
+		}
+
+		/*
+		 * The IPA is reported as [MAX:12], so we need to
+		 * complement it with the bottom 12 bits from the
+		 * faulting VA. This is always 12 bits, irrespective
+		 * of the page size.
+		 */
+		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
+		ret = io_mem_abort(vcpu, run, fault_ipa);
+		goto out_unlock;
+	}
+
+	/* Userspace should not be able to register out-of-bounds IPAs */
+	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+
+	if (fault_status == FSC_ACCESS) {
+		handle_access_fault(vcpu, fault_ipa);
+		ret = 1;
+		goto out_unlock;
+	}
+
+	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+	if (ret == 0)
+		ret = 1;
+out:
+	if (ret == -ENOEXEC) {
+		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+		ret = 1;
+	}
+out_unlock:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	return ret;
+}
+
+static int handle_hva_to_gpa(struct kvm *kvm,
+			     unsigned long start,
+			     unsigned long end,
+			     int (*handler)(struct kvm *kvm,
+					    gpa_t gpa, u64 size,
+					    void *data),
+			     void *data)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(kvm);
+
+	/* we only care about the pages that the guest sees */
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gpa;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+
+		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
+		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
+	}
+
+	return ret;
+}
+
+static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+	unmap_stage2_range(kvm, gpa, size);
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm,
+			unsigned long start, unsigned long end)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+
+	trace_kvm_unmap_hva_range(start, end);
+	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+	return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+	pte_t *pte = (pte_t *)data;
+
+	WARN_ON(size != PAGE_SIZE);
+	/*
+	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+	 * flag clear because MMU notifiers will have unmapped a huge PMD before
+	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore stage2_set_pte() never needs to clear out a huge PMD
+	 * through this calling path.
+	 */
+	stage2_set_pte(kvm, NULL, gpa, pte, 0);
+	return 0;
+}
+
+
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	unsigned long end = hva + PAGE_SIZE;
+	kvm_pfn_t pfn = pte_pfn(pte);
+	pte_t stage2_pte;
+
+	if (!kvm->arch.pgd)
+		return 0;
+
+	trace_kvm_set_spte_hva(hva);
+
+	/*
+	 * We've moved a page around, probably through CoW, so let's treat it
+	 * just like a translation fault and clean the cache to the PoC.
+	 */
+	clean_dcache_guest_page(pfn, PAGE_SIZE);
+	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
+	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+	return 0;
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+		return 0;
+
+	if (pud)
+		return stage2_pudp_test_and_clear_young(pud);
+	else if (pmd)
+		return stage2_pmdp_test_and_clear_young(pmd);
+	else
+		return stage2_ptep_test_and_clear_young(pte);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+		return 0;
+
+	if (pud)
+		return kvm_s2pud_young(*pud);
+	else if (pmd)
+		return pmd_young(*pmd);
+	else
+		return pte_young(*pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+	trace_kvm_age_hva(start, end);
+	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	if (!kvm->arch.pgd)
+		return 0;
+	trace_kvm_test_age_hva(hva);
+	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
+				 kvm_test_age_hva_handler, NULL);
+}
+
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+}
+
+phys_addr_t kvm_mmu_get_httbr(void)
+{
+	if (__kvm_cpu_uses_extended_idmap())
+		return virt_to_phys(merged_hyp_pgd);
+	else
+		return virt_to_phys(hyp_pgd);
+}
+
+phys_addr_t kvm_get_idmap_vector(void)
+{
+	return hyp_idmap_vector;
+}
+
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+	int err;
+
+	/* Create the idmap in the boot page tables */
+	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+				      hyp_idmap_start, hyp_idmap_end,
+				      __phys_to_pfn(hyp_idmap_start),
+				      PAGE_HYP_EXEC);
+	if (err)
+		kvm_err("Failed to idmap %lx-%lx\n",
+			hyp_idmap_start, hyp_idmap_end);
+
+	return err;
+}
+
+int kvm_mmu_init(void)
+{
+	int err;
+
+	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
+	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
+	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
+	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
+	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
+
+	/*
+	 * We rely on the linker script to ensure at build time that the HYP
+	 * init code does not cross a page boundary.
+	 */
+	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+
+	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
+	kvm_debug("HYP VA range: %lx:%lx\n",
+		  kern_hyp_va(PAGE_OFFSET),
+		  kern_hyp_va((unsigned long)high_memory - 1));
+
+	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
+	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
+		/*
+		 * The idmap page is intersecting with the VA space,
+		 * it is not safe to continue further.
+		 */
+		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+	if (!hyp_pgd) {
+		kvm_err("Hyp mode PGD not allocated\n");
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (__kvm_cpu_uses_extended_idmap()) {
+		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+							 hyp_pgd_order);
+		if (!boot_hyp_pgd) {
+			kvm_err("Hyp boot PGD not allocated\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = kvm_map_idmap_text(boot_hyp_pgd);
+		if (err)
+			goto out;
+
+		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		if (!merged_hyp_pgd) {
+			kvm_err("Failed to allocate extra HYP pgd\n");
+			goto out;
+		}
+		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
+				    hyp_idmap_start);
+	} else {
+		err = kvm_map_idmap_text(hyp_pgd);
+		if (err)
+			goto out;
+	}
+
+	io_map_base = hyp_idmap_start;
+	return 0;
+out:
+	free_hyp_pgds();
+	return err;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   const struct kvm_userspace_memory_region *mem,
+				   struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
+				   enum kvm_mr_change change)
+{
+	/*
+	 * At this point memslot has been committed and there is an
+	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
+	 * memory slot is write protected.
+	 */
+	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
+		kvm_mmu_wp_memory_region(kvm, mem->slot);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot,
+				   const struct kvm_userspace_memory_region *mem,
+				   enum kvm_mr_change change)
+{
+	hva_t hva = mem->userspace_addr;
+	hva_t reg_end = hva + mem->memory_size;
+	bool writable = !(mem->flags & KVM_MEM_READONLY);
+	int ret = 0;
+
+	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+			change != KVM_MR_FLAGS_ONLY)
+		return 0;
+
+	/*
+	 * Prevent userspace from creating a memory region outside of the IPA
+	 * space addressable by the KVM guest IPA space.
+	 */
+	if (memslot->base_gfn + memslot->npages >=
+	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
+		return -EFAULT;
+
+	down_read(&current->mm->mmap_sem);
+	/*
+	 * A memory region could potentially cover multiple VMAs, and any holes
+	 * between them, so iterate over all of them to find out if we can map
+	 * any of them right now.
+	 *
+	 *     +--------------------------------------------+
+	 * +---------------+----------------+   +----------------+
+	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+	 * +---------------+----------------+   +----------------+
+	 *     |               memory region                |
+	 *     +--------------------------------------------+
+	 */
+	do {
+		struct vm_area_struct *vma = find_vma(current->mm, hva);
+		hva_t vm_start, vm_end;
+
+		if (!vma || vma->vm_start >= reg_end)
+			break;
+
+		/*
+		 * Take the intersection of this VMA with the memory region
+		 */
+		vm_start = max(hva, vma->vm_start);
+		vm_end = min(reg_end, vma->vm_end);
+
+		if (vma->vm_flags & VM_PFNMAP) {
+			gpa_t gpa = mem->guest_phys_addr +
+				    (vm_start - mem->userspace_addr);
+			phys_addr_t pa;
+
+			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+			pa += vm_start - vma->vm_start;
+
+			/* IO region dirty page logging not allowed */
+			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+						    vm_end - vm_start,
+						    writable);
+			if (ret)
+				break;
+		}
+		hva = vm_end;
+	} while (hva < reg_end);
+
+	if (change == KVM_MR_FLAGS_ONLY)
+		goto out;
+
+	spin_lock(&kvm->mmu_lock);
+	if (ret)
+		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
+	else
+		stage2_flush_memslot(kvm, memslot);
+	spin_unlock(&kvm->mmu_lock);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+	kvm_free_stage2_pgd(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+	phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+	spin_lock(&kvm->mmu_lock);
+	unmap_stage2_range(kvm, gpa, size);
+	spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.").
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ *   caches being turned on/off, and do a full clean.
+ *
+ * - We flush the caches on both caches being turned on and off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
+void kvm_set_way_flush(struct kvm_vcpu *vcpu)
+{
+	unsigned long hcr = *vcpu_hcr(vcpu);
+
+	/*
+	 * If this is the first time we do a S/W operation
+	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
+	 * VM trapping.
+	 *
+	 * Otherwise, rely on the VM trapping to wait for the MMU +
+	 * Caches to be turned off. At that point, we'll be able to
+	 * clean the caches again.
+	 */
+	if (!(hcr & HCR_TVM)) {
+		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
+					vcpu_has_cache_enabled(vcpu));
+		stage2_flush_vm(vcpu->kvm);
+		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
+	}
+}
+
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
+{
+	bool now_enabled = vcpu_has_cache_enabled(vcpu);
+
+	/*
+	 * If switching the MMU+caches on, need to invalidate the caches.
+	 * If switching it off, need to clean the caches.
+	 * Clean + invalidate does the trick always.
+	 */
+	if (now_enabled != was_enabled)
+		stage2_flush_vm(vcpu->kvm);
+
+	/* Caches are now on, stop trapping VM ops (until a S/W op) */
+	if (now_enabled)
+		*vcpu_hcr(vcpu) &= ~HCR_TVM;
+
+	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
+}
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
new file mode 100644
index 000000000000..d45b8b9a4415
--- /dev/null
+++ b/arch/arm64/kvm/perf.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Based on the x86 implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+static int kvm_is_in_guest(void)
+{
+        return kvm_get_running_vcpu() != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = kvm_get_running_vcpu();
+
+	if (vcpu)
+		return !vcpu_mode_priv(vcpu);
+
+	return 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = kvm_get_running_vcpu();
+
+	if (vcpu)
+		return *vcpu_pc(vcpu);
+
+	return 0;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+	.is_in_guest	= kvm_is_in_guest,
+	.is_user_mode	= kvm_is_user_mode,
+	.get_guest_ip	= kvm_get_guest_ip,
+};
+
+int kvm_perf_init(void)
+{
+	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+
+int kvm_perf_teardown(void)
+{
+	return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
new file mode 100644
index 000000000000..f0d0312c0a55
--- /dev/null
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -0,0 +1,869 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Shannon Zhao <shannon.zhao@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_emulate.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_vgic.h>
+
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
+
+#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+
+/**
+ * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	return (select_idx == ARMV8_PMU_CYCLE_IDX &&
+		__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
+}
+
+static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu;
+	struct kvm_vcpu_arch *vcpu_arch;
+
+	pmc -= pmc->idx;
+	pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
+	vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
+	return container_of(vcpu_arch, struct kvm_vcpu, arch);
+}
+
+/**
+ * kvm_pmu_pmc_is_chained - determine if the pmc is chained
+ * @pmc: The PMU counter pointer
+ */
+static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
+{
+	struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+
+	return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
+{
+	return select_idx & 0x1;
+}
+
+/**
+ * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
+ * @pmc: The PMU counter pointer
+ *
+ * When a pair of PMCs are chained together we use the low counter (canonical)
+ * to hold the underlying perf event.
+ */
+static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
+{
+	if (kvm_pmu_pmc_is_chained(pmc) &&
+	    kvm_pmu_idx_is_high_counter(pmc->idx))
+		return pmc - 1;
+
+	return pmc;
+}
+static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
+{
+	if (kvm_pmu_idx_is_high_counter(pmc->idx))
+		return pmc - 1;
+	else
+		return pmc + 1;
+}
+
+/**
+ * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	u64 eventsel, reg;
+
+	select_idx |= 0x1;
+
+	if (select_idx == ARMV8_PMU_CYCLE_IDX)
+		return false;
+
+	reg = PMEVTYPER0_EL0 + select_idx;
+	eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+
+	return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
+}
+
+/**
+ * kvm_pmu_get_pair_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @pmc: The PMU counter pointer
+ */
+static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
+					  struct kvm_pmc *pmc)
+{
+	u64 counter, counter_high, reg, enabled, running;
+
+	if (kvm_pmu_pmc_is_chained(pmc)) {
+		pmc = kvm_pmu_get_canonical_pmc(pmc);
+		reg = PMEVCNTR0_EL0 + pmc->idx;
+
+		counter = __vcpu_sys_reg(vcpu, reg);
+		counter_high = __vcpu_sys_reg(vcpu, reg + 1);
+
+		counter = lower_32_bits(counter) | (counter_high << 32);
+	} else {
+		reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+		      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
+		counter = __vcpu_sys_reg(vcpu, reg);
+	}
+
+	/*
+	 * The real counter value is equal to the value of counter register plus
+	 * the value perf event counts.
+	 */
+	if (pmc->perf_event)
+		counter += perf_event_read_value(pmc->perf_event, &enabled,
+						 &running);
+
+	return counter;
+}
+
+/**
+ * kvm_pmu_get_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	u64 counter;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
+
+	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+	if (kvm_pmu_pmc_is_chained(pmc) &&
+	    kvm_pmu_idx_is_high_counter(select_idx))
+		counter = upper_32_bits(counter);
+	else if (select_idx != ARMV8_PMU_CYCLE_IDX)
+		counter = lower_32_bits(counter);
+
+	return counter;
+}
+
+/**
+ * kvm_pmu_set_counter_value - set PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+	u64 reg;
+
+	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+	      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
+	__vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
+
+	/* Recreate the perf event to reflect the updated sample_period */
+	kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+/**
+ * kvm_pmu_release_perf_event - remove the perf event
+ * @pmc: The PMU counter pointer
+ */
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+	pmc = kvm_pmu_get_canonical_pmc(pmc);
+	if (pmc->perf_event) {
+		perf_event_disable(pmc->perf_event);
+		perf_event_release_kernel(pmc->perf_event);
+		pmc->perf_event = NULL;
+	}
+}
+
+/**
+ * kvm_pmu_stop_counter - stop PMU counter
+ * @pmc: The PMU counter pointer
+ *
+ * If this counter has been configured to monitor some event, release it here.
+ */
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
+{
+	u64 counter, reg, val;
+
+	pmc = kvm_pmu_get_canonical_pmc(pmc);
+	if (!pmc->perf_event)
+		return;
+
+	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+	if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
+		reg = PMCCNTR_EL0;
+		val = counter;
+	} else {
+		reg = PMEVCNTR0_EL0 + pmc->idx;
+		val = lower_32_bits(counter);
+	}
+
+	__vcpu_sys_reg(vcpu, reg) = val;
+
+	if (kvm_pmu_pmc_is_chained(pmc))
+		__vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
+
+	kvm_pmu_release_perf_event(pmc);
+}
+
+/**
+ * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+		pmu->pmc[i].idx = i;
+}
+
+/**
+ * kvm_pmu_vcpu_reset - reset pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int i;
+
+	for_each_set_bit(i, &mask, 32)
+		kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
+
+	bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
+}
+
+/**
+ * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+		kvm_pmu_release_perf_event(&pmu->pmc[i]);
+}
+
+u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
+{
+	u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
+
+	val &= ARMV8_PMU_PMCR_N_MASK;
+	if (val == 0)
+		return BIT(ARMV8_PMU_CYCLE_IDX);
+	else
+		return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
+}
+
+/**
+ * kvm_pmu_enable_counter_mask - enable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENSET register
+ *
+ * Call perf_event_enable to start counting the perf event
+ */
+void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
+		return;
+
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+		if (!(val & BIT(i)))
+			continue;
+
+		pmc = &pmu->pmc[i];
+
+		/* A change in the enable state may affect the chain state */
+		kvm_pmu_update_pmc_chained(vcpu, i);
+		kvm_pmu_create_perf_event(vcpu, i);
+
+		/* At this point, pmc must be the canonical */
+		if (pmc->perf_event) {
+			perf_event_enable(pmc->perf_event);
+			if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
+				kvm_debug("fail to enable perf event\n");
+		}
+	}
+}
+
+/**
+ * kvm_pmu_disable_counter_mask - disable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENCLR register
+ *
+ * Call perf_event_disable to stop counting the perf event
+ */
+void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	if (!val)
+		return;
+
+	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+		if (!(val & BIT(i)))
+			continue;
+
+		pmc = &pmu->pmc[i];
+
+		/* A change in the enable state may affect the chain state */
+		kvm_pmu_update_pmc_chained(vcpu, i);
+		kvm_pmu_create_perf_event(vcpu, i);
+
+		/* At this point, pmc must be the canonical */
+		if (pmc->perf_event)
+			perf_event_disable(pmc->perf_event);
+	}
+}
+
+static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+{
+	u64 reg = 0;
+
+	if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
+		reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+		reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+		reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+		reg &= kvm_pmu_valid_counter_mask(vcpu);
+	}
+
+	return reg;
+}
+
+static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	bool overflow;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return;
+
+	overflow = !!kvm_pmu_overflow_status(vcpu);
+	if (pmu->irq_level == overflow)
+		return;
+
+	pmu->irq_level = overflow;
+
+	if (likely(irqchip_in_kernel(vcpu->kvm))) {
+		int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+					      pmu->irq_num, overflow, pmu);
+		WARN_ON(ret);
+	}
+}
+
+bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+	bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
+
+	if (likely(irqchip_in_kernel(vcpu->kvm)))
+		return false;
+
+	return pmu->irq_level != run_level;
+}
+
+/*
+ * Reflect the PMU overflow interrupt output level into the kvm_run structure
+ */
+void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
+{
+	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+	/* Populate the timer bitmap for user space */
+	regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
+	if (vcpu->arch.pmu.irq_level)
+		regs->device_irq_level |= KVM_ARM_DEV_PMU;
+}
+
+/**
+ * kvm_pmu_flush_hwstate - flush pmu state to cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the host, and inject
+ * an interrupt if that was the case.
+ */
+void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+	kvm_pmu_update_state(vcpu);
+}
+
+/**
+ * kvm_pmu_sync_hwstate - sync pmu state from cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the guest, and
+ * inject an interrupt if that was the case.
+ */
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+	kvm_pmu_update_state(vcpu);
+}
+
+/**
+ * When the perf event overflows, set the overflow status and inform the vcpu.
+ */
+static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
+				  struct perf_sample_data *data,
+				  struct pt_regs *regs)
+{
+	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
+	struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+	int idx = pmc->idx;
+	u64 period;
+
+	cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
+
+	/*
+	 * Reset the sample period to the architectural limit,
+	 * i.e. the point where the counter overflows.
+	 */
+	period = -(local64_read(&perf_event->count));
+
+	if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+		period &= GENMASK(31, 0);
+
+	local64_set(&perf_event->hw.period_left, 0);
+	perf_event->attr.sample_period = period;
+	perf_event->hw.sample_period = period;
+
+	__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+
+	if (kvm_pmu_overflow_status(vcpu)) {
+		kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
+}
+
+/**
+ * kvm_pmu_software_increment - do software increment
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMSWINC register
+ */
+void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int i;
+
+	if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+		return;
+
+	/* Weed out disabled counters */
+	val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+
+	for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
+		u64 type, reg;
+
+		if (!(val & BIT(i)))
+			continue;
+
+		/* PMSWINC only applies to ... SW_INC! */
+		type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
+		type &= ARMV8_PMU_EVTYPE_EVENT;
+		if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
+			continue;
+
+		/* increment this even SW_INC counter */
+		reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+		reg = lower_32_bits(reg);
+		__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
+
+		if (reg) /* no overflow on the low part */
+			continue;
+
+		if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
+			/* increment the high counter */
+			reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
+			reg = lower_32_bits(reg);
+			__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
+			if (!reg) /* mark overflow on the high counter */
+				__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
+		} else {
+			/* mark overflow on low counter */
+			__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+		}
+	}
+}
+
+/**
+ * kvm_pmu_handle_pmcr - handle PMCR register
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCR register
+ */
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
+{
+	unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+	int i;
+
+	if (val & ARMV8_PMU_PMCR_E) {
+		kvm_pmu_enable_counter_mask(vcpu,
+		       __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+	} else {
+		kvm_pmu_disable_counter_mask(vcpu, mask);
+	}
+
+	if (val & ARMV8_PMU_PMCR_C)
+		kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
+
+	if (val & ARMV8_PMU_PMCR_P) {
+		for_each_set_bit(i, &mask, 32)
+			kvm_pmu_set_counter_value(vcpu, i, 0);
+	}
+}
+
+static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
+	       (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
+}
+
+/**
+ * kvm_pmu_create_perf_event - create a perf event for a counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ */
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+	struct perf_event *event;
+	struct perf_event_attr attr;
+	u64 eventsel, counter, reg, data;
+
+	/*
+	 * For chained counters the event type and filtering attributes are
+	 * obtained from the low/even counter. We also use this counter to
+	 * determine if the event is enabled/disabled.
+	 */
+	pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
+
+	reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+	      ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
+	data = __vcpu_sys_reg(vcpu, reg);
+
+	kvm_pmu_stop_counter(vcpu, pmc);
+	eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+
+	/* Software increment event does't need to be backed by a perf event */
+	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
+	    pmc->idx != ARMV8_PMU_CYCLE_IDX)
+		return;
+
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+	attr.type = PERF_TYPE_RAW;
+	attr.size = sizeof(attr);
+	attr.pinned = 1;
+	attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
+	attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
+	attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
+	attr.exclude_hv = 1; /* Don't count EL2 events */
+	attr.exclude_host = 1; /* Don't count host events */
+	attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
+		ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+
+	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+	if (kvm_pmu_pmc_is_chained(pmc)) {
+		/**
+		 * The initial sample period (overflow count) of an event. For
+		 * chained counters we only support overflow interrupts on the
+		 * high counter.
+		 */
+		attr.sample_period = (-counter) & GENMASK(63, 0);
+		attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
+
+		event = perf_event_create_kernel_counter(&attr, -1, current,
+							 kvm_pmu_perf_overflow,
+							 pmc + 1);
+	} else {
+		/* The initial sample period (overflow count) of an event. */
+		if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+			attr.sample_period = (-counter) & GENMASK(63, 0);
+		else
+			attr.sample_period = (-counter) & GENMASK(31, 0);
+
+		event = perf_event_create_kernel_counter(&attr, -1, current,
+						 kvm_pmu_perf_overflow, pmc);
+	}
+
+	if (IS_ERR(event)) {
+		pr_err_once("kvm: pmu event creation failed %ld\n",
+			    PTR_ERR(event));
+		return;
+	}
+
+	pmc->perf_event = event;
+}
+
+/**
+ * kvm_pmu_update_pmc_chained - update chained bitmap
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ *
+ * Update the chained bitmap based on the event type written in the
+ * typer register and the enable state of the odd register.
+ */
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
+	bool new_state, old_state;
+
+	old_state = kvm_pmu_pmc_is_chained(pmc);
+	new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
+		    kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
+
+	if (old_state == new_state)
+		return;
+
+	canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
+	kvm_pmu_stop_counter(vcpu, canonical_pmc);
+	if (new_state) {
+		/*
+		 * During promotion from !chained to chained we must ensure
+		 * the adjacent counter is stopped and its event destroyed
+		 */
+		kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
+		set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+		return;
+	}
+	clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
+ * @vcpu: The vcpu pointer
+ * @data: The data guest writes to PMXEVTYPER_EL0
+ * @select_idx: The number of selected counter
+ *
+ * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an
+ * event with given hardware event number. Here we call perf_event API to
+ * emulate this action and create a kernel perf event for it.
+ */
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
+				    u64 select_idx)
+{
+	u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+
+	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+	      ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
+
+	__vcpu_sys_reg(vcpu, reg) = event_type;
+
+	kvm_pmu_update_pmc_chained(vcpu, select_idx);
+	kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+bool kvm_arm_support_pmu_v3(void)
+{
+	/*
+	 * Check if HW_PERF_EVENTS are supported by checking the number of
+	 * hardware performance counters. This could ensure the presence of
+	 * a physical PMU and CONFIG_PERF_EVENT is selected.
+	 */
+	return (perf_num_counters() > 0);
+}
+
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.pmu.created)
+		return 0;
+
+	/*
+	 * A valid interrupt configuration for the PMU is either to have a
+	 * properly configured interrupt number and using an in-kernel
+	 * irqchip, or to not have an in-kernel GIC and not set an IRQ.
+	 */
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		int irq = vcpu->arch.pmu.irq_num;
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			return -EINVAL;
+
+		/*
+		 * If we are using an in-kernel vgic, at this point we know
+		 * the vgic will be initialized, so we can check the PMU irq
+		 * number against the dimensions of the vgic and make sure
+		 * it's valid.
+		 */
+		if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
+			return -EINVAL;
+	} else if (kvm_arm_pmu_irq_initialized(vcpu)) {
+		   return -EINVAL;
+	}
+
+	kvm_pmu_vcpu_reset(vcpu);
+	vcpu->arch.pmu.ready = true;
+
+	return 0;
+}
+
+static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_arm_support_pmu_v3())
+		return -ENODEV;
+
+	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+		return -ENXIO;
+
+	if (vcpu->arch.pmu.created)
+		return -EBUSY;
+
+	if (irqchip_in_kernel(vcpu->kvm)) {
+		int ret;
+
+		/*
+		 * If using the PMU with an in-kernel virtual GIC
+		 * implementation, we require the GIC to be already
+		 * initialized when initializing the PMU.
+		 */
+		if (!vgic_initialized(vcpu->kvm))
+			return -ENODEV;
+
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			return -ENXIO;
+
+		ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
+					 &vcpu->arch.pmu);
+		if (ret)
+			return ret;
+	}
+
+	vcpu->arch.pmu.created = true;
+	return 0;
+}
+
+/*
+ * For one VM the interrupt type must be same for each vcpu.
+ * As a PPI, the interrupt number is the same for all vcpus,
+ * while as an SPI it must be a separate number per vcpu.
+ */
+static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			continue;
+
+		if (irq_is_ppi(irq)) {
+			if (vcpu->arch.pmu.irq_num != irq)
+				return false;
+		} else {
+			if (vcpu->arch.pmu.irq_num == irq)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_PMU_V3_IRQ: {
+		int __user *uaddr = (int __user *)(long)attr->addr;
+		int irq;
+
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
+
+		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+			return -ENODEV;
+
+		if (get_user(irq, uaddr))
+			return -EFAULT;
+
+		/* The PMU overflow interrupt can be a PPI or a valid SPI. */
+		if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
+			return -EINVAL;
+
+		if (!pmu_irq_is_valid(vcpu->kvm, irq))
+			return -EINVAL;
+
+		if (kvm_arm_pmu_irq_initialized(vcpu))
+			return -EBUSY;
+
+		kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
+		vcpu->arch.pmu.irq_num = irq;
+		return 0;
+	}
+	case KVM_ARM_VCPU_PMU_V3_INIT:
+		return kvm_arm_pmu_v3_init(vcpu);
+	}
+
+	return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_PMU_V3_IRQ: {
+		int __user *uaddr = (int __user *)(long)attr->addr;
+		int irq;
+
+		if (!irqchip_in_kernel(vcpu->kvm))
+			return -EINVAL;
+
+		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+			return -ENODEV;
+
+		if (!kvm_arm_pmu_irq_initialized(vcpu))
+			return -ENXIO;
+
+		irq = vcpu->arch.pmu.irq_num;
+		return put_user(irq, uaddr);
+	}
+	}
+
+	return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_PMU_V3_IRQ:
+	case KVM_ARM_VCPU_PMU_V3_INIT:
+		if (kvm_arm_support_pmu_v3() &&
+		    test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+			return 0;
+	}
+
+	return -ENXIO;
+}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
new file mode 100644
index 000000000000..ae364716ee40
--- /dev/null
+++ b/arch/arm64/kvm/psci.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/preempt.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+
+#include <asm/cputype.h>
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
+
+/*
+ * This is an implementation of the Power State Coordination Interface
+ * as described in ARM document number ARM DEN 0022A.
+ */
+
+#define AFFINITY_MASK(level)	~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
+
+static unsigned long psci_affinity_mask(unsigned long affinity_level)
+{
+	if (affinity_level <= 3)
+		return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
+
+	return 0;
+}
+
+static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * NOTE: For simplicity, we make VCPU suspend emulation to be
+	 * same-as WFI (Wait-for-interrupt) emulation.
+	 *
+	 * This means for KVM the wakeup events are interrupts and
+	 * this is consistent with intended use of StateID as described
+	 * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
+	 *
+	 * Further, we also treat power-down request to be same as
+	 * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2
+	 * specification (ARM DEN 0022A). This means all suspend states
+	 * for KVM will preserve the register state.
+	 */
+	kvm_vcpu_block(vcpu);
+	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+
+	return PSCI_RET_SUCCESS;
+}
+
+static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.power_off = true;
+	kvm_make_request(KVM_REQ_SLEEP, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
+{
+	struct vcpu_reset_state *reset_state;
+	struct kvm *kvm = source_vcpu->kvm;
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long cpu_id;
+
+	cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
+	if (vcpu_mode_is_32bit(source_vcpu))
+		cpu_id &= ~((u32) 0);
+
+	vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
+
+	/*
+	 * Make sure the caller requested a valid CPU and that the CPU is
+	 * turned off.
+	 */
+	if (!vcpu)
+		return PSCI_RET_INVALID_PARAMS;
+	if (!vcpu->arch.power_off) {
+		if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
+			return PSCI_RET_ALREADY_ON;
+		else
+			return PSCI_RET_INVALID_PARAMS;
+	}
+
+	reset_state = &vcpu->arch.reset_state;
+
+	reset_state->pc = smccc_get_arg2(source_vcpu);
+
+	/* Propagate caller endianness */
+	reset_state->be = kvm_vcpu_is_be(source_vcpu);
+
+	/*
+	 * NOTE: We always update r0 (or x0) because for PSCI v0.1
+	 * the general puspose registers are undefined upon CPU_ON.
+	 */
+	reset_state->r0 = smccc_get_arg3(source_vcpu);
+
+	WRITE_ONCE(reset_state->reset, true);
+	kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
+
+	/*
+	 * Make sure the reset request is observed if the change to
+	 * power_state is observed.
+	 */
+	smp_wmb();
+
+	vcpu->arch.power_off = false;
+	kvm_vcpu_wake_up(vcpu);
+
+	return PSCI_RET_SUCCESS;
+}
+
+static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
+{
+	int i, matching_cpus = 0;
+	unsigned long mpidr;
+	unsigned long target_affinity;
+	unsigned long target_affinity_mask;
+	unsigned long lowest_affinity_level;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *tmp;
+
+	target_affinity = smccc_get_arg1(vcpu);
+	lowest_affinity_level = smccc_get_arg2(vcpu);
+
+	/* Determine target affinity mask */
+	target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
+	if (!target_affinity_mask)
+		return PSCI_RET_INVALID_PARAMS;
+
+	/* Ignore other bits of target affinity */
+	target_affinity &= target_affinity_mask;
+
+	/*
+	 * If one or more VCPU matching target affinity are running
+	 * then ON else OFF
+	 */
+	kvm_for_each_vcpu(i, tmp, kvm) {
+		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
+		if ((mpidr & target_affinity_mask) == target_affinity) {
+			matching_cpus++;
+			if (!tmp->arch.power_off)
+				return PSCI_0_2_AFFINITY_LEVEL_ON;
+		}
+	}
+
+	if (!matching_cpus)
+		return PSCI_RET_INVALID_PARAMS;
+
+	return PSCI_0_2_AFFINITY_LEVEL_OFF;
+}
+
+static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
+{
+	int i;
+	struct kvm_vcpu *tmp;
+
+	/*
+	 * The KVM ABI specifies that a system event exit may call KVM_RUN
+	 * again and may perform shutdown/reboot at a later time that when the
+	 * actual request is made.  Since we are implementing PSCI and a
+	 * caller of PSCI reboot and shutdown expects that the system shuts
+	 * down or reboots immediately, let's make sure that VCPUs are not run
+	 * after this call is handled and before the VCPUs have been
+	 * re-initialized.
+	 */
+	kvm_for_each_vcpu(i, tmp, vcpu->kvm)
+		tmp->arch.power_off = true;
+	kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
+
+	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+	vcpu->run->system_event.type = type;
+	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
+static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
+{
+	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
+}
+
+static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
+{
+	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
+}
+
+static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	/*
+	 * Zero the input registers' upper 32 bits. They will be fully
+	 * zeroed on exit, so we're fine changing them in place.
+	 */
+	for (i = 1; i < 4; i++)
+		vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
+}
+
+static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
+{
+	switch(fn) {
+	case PSCI_0_2_FN64_CPU_SUSPEND:
+	case PSCI_0_2_FN64_CPU_ON:
+	case PSCI_0_2_FN64_AFFINITY_INFO:
+		/* Disallow these functions for 32bit guests */
+		if (vcpu_mode_is_32bit(vcpu))
+			return PSCI_RET_NOT_SUPPORTED;
+		break;
+	}
+
+	return 0;
+}
+
+static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u32 psci_fn = smccc_get_function(vcpu);
+	unsigned long val;
+	int ret = 1;
+
+	val = kvm_psci_check_allowed_function(vcpu, psci_fn);
+	if (val)
+		goto out;
+
+	switch (psci_fn) {
+	case PSCI_0_2_FN_PSCI_VERSION:
+		/*
+		 * Bits[31:16] = Major Version = 0
+		 * Bits[15:0] = Minor Version = 2
+		 */
+		val = KVM_ARM_PSCI_0_2;
+		break;
+	case PSCI_0_2_FN_CPU_SUSPEND:
+	case PSCI_0_2_FN64_CPU_SUSPEND:
+		val = kvm_psci_vcpu_suspend(vcpu);
+		break;
+	case PSCI_0_2_FN_CPU_OFF:
+		kvm_psci_vcpu_off(vcpu);
+		val = PSCI_RET_SUCCESS;
+		break;
+	case PSCI_0_2_FN_CPU_ON:
+		kvm_psci_narrow_to_32bit(vcpu);
+		fallthrough;
+	case PSCI_0_2_FN64_CPU_ON:
+		mutex_lock(&kvm->lock);
+		val = kvm_psci_vcpu_on(vcpu);
+		mutex_unlock(&kvm->lock);
+		break;
+	case PSCI_0_2_FN_AFFINITY_INFO:
+		kvm_psci_narrow_to_32bit(vcpu);
+		fallthrough;
+	case PSCI_0_2_FN64_AFFINITY_INFO:
+		val = kvm_psci_vcpu_affinity_info(vcpu);
+		break;
+	case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+		/*
+		 * Trusted OS is MP hence does not require migration
+	         * or
+		 * Trusted OS is not present
+		 */
+		val = PSCI_0_2_TOS_MP;
+		break;
+	case PSCI_0_2_FN_SYSTEM_OFF:
+		kvm_psci_system_off(vcpu);
+		/*
+		 * We should'nt be going back to guest VCPU after
+		 * receiving SYSTEM_OFF request.
+		 *
+		 * If user space accidently/deliberately resumes
+		 * guest VCPU after SYSTEM_OFF request then guest
+		 * VCPU should see internal failure from PSCI return
+		 * value. To achieve this, we preload r0 (or x0) with
+		 * PSCI return value INTERNAL_FAILURE.
+		 */
+		val = PSCI_RET_INTERNAL_FAILURE;
+		ret = 0;
+		break;
+	case PSCI_0_2_FN_SYSTEM_RESET:
+		kvm_psci_system_reset(vcpu);
+		/*
+		 * Same reason as SYSTEM_OFF for preloading r0 (or x0)
+		 * with PSCI return value INTERNAL_FAILURE.
+		 */
+		val = PSCI_RET_INTERNAL_FAILURE;
+		ret = 0;
+		break;
+	default:
+		val = PSCI_RET_NOT_SUPPORTED;
+		break;
+	}
+
+out:
+	smccc_set_retval(vcpu, val, 0, 0, 0);
+	return ret;
+}
+
+static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
+{
+	u32 psci_fn = smccc_get_function(vcpu);
+	u32 feature;
+	unsigned long val;
+	int ret = 1;
+
+	switch(psci_fn) {
+	case PSCI_0_2_FN_PSCI_VERSION:
+		val = KVM_ARM_PSCI_1_0;
+		break;
+	case PSCI_1_0_FN_PSCI_FEATURES:
+		feature = smccc_get_arg1(vcpu);
+		val = kvm_psci_check_allowed_function(vcpu, feature);
+		if (val)
+			break;
+
+		switch(feature) {
+		case PSCI_0_2_FN_PSCI_VERSION:
+		case PSCI_0_2_FN_CPU_SUSPEND:
+		case PSCI_0_2_FN64_CPU_SUSPEND:
+		case PSCI_0_2_FN_CPU_OFF:
+		case PSCI_0_2_FN_CPU_ON:
+		case PSCI_0_2_FN64_CPU_ON:
+		case PSCI_0_2_FN_AFFINITY_INFO:
+		case PSCI_0_2_FN64_AFFINITY_INFO:
+		case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+		case PSCI_0_2_FN_SYSTEM_OFF:
+		case PSCI_0_2_FN_SYSTEM_RESET:
+		case PSCI_1_0_FN_PSCI_FEATURES:
+		case ARM_SMCCC_VERSION_FUNC_ID:
+			val = 0;
+			break;
+		default:
+			val = PSCI_RET_NOT_SUPPORTED;
+			break;
+		}
+		break;
+	default:
+		return kvm_psci_0_2_call(vcpu);
+	}
+
+	smccc_set_retval(vcpu, val, 0, 0, 0);
+	return ret;
+}
+
+static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u32 psci_fn = smccc_get_function(vcpu);
+	unsigned long val;
+
+	switch (psci_fn) {
+	case KVM_PSCI_FN_CPU_OFF:
+		kvm_psci_vcpu_off(vcpu);
+		val = PSCI_RET_SUCCESS;
+		break;
+	case KVM_PSCI_FN_CPU_ON:
+		mutex_lock(&kvm->lock);
+		val = kvm_psci_vcpu_on(vcpu);
+		mutex_unlock(&kvm->lock);
+		break;
+	default:
+		val = PSCI_RET_NOT_SUPPORTED;
+		break;
+	}
+
+	smccc_set_retval(vcpu, val, 0, 0, 0);
+	return 1;
+}
+
+/**
+ * kvm_psci_call - handle PSCI call if r0 value is in range
+ * @vcpu: Pointer to the VCPU struct
+ *
+ * Handle PSCI calls from guests through traps from HVC instructions.
+ * The calling convention is similar to SMC calls to the secure world
+ * where the function number is placed in r0.
+ *
+ * This function returns: > 0 (success), 0 (success but exit to user
+ * space), and < 0 (errors)
+ *
+ * Errors:
+ * -EINVAL: Unrecognized PSCI function
+ */
+int kvm_psci_call(struct kvm_vcpu *vcpu)
+{
+	switch (kvm_psci_version(vcpu, vcpu->kvm)) {
+	case KVM_ARM_PSCI_1_0:
+		return kvm_psci_1_0_call(vcpu);
+	case KVM_ARM_PSCI_0_2:
+		return kvm_psci_0_2_call(vcpu);
+	case KVM_ARM_PSCI_0_1:
+		return kvm_psci_0_1_call(vcpu);
+	default:
+		return -EINVAL;
+	};
+}
+
+int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
+{
+	return 3;		/* PSCI version and two workaround registers */
+}
+
+int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
+{
+	if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
+		return -EFAULT;
+
+	if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
+		return -EFAULT;
+
+	if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define KVM_REG_FEATURE_LEVEL_WIDTH	4
+#define KVM_REG_FEATURE_LEVEL_MASK	(BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
+
+/*
+ * Convert the workaround level into an easy-to-compare number, where higher
+ * values mean better protection.
+ */
+static int get_kernel_wa_level(u64 regid)
+{
+	switch (regid) {
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+		switch (kvm_arm_harden_branch_predictor()) {
+		case KVM_BP_HARDEN_UNKNOWN:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+		case KVM_BP_HARDEN_WA_NEEDED:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
+		case KVM_BP_HARDEN_NOT_REQUIRED:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
+		}
+		return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+		switch (kvm_arm_have_ssbd()) {
+		case KVM_SSBD_FORCE_DISABLE:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+		case KVM_SSBD_KERNEL:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
+		case KVM_SSBD_FORCE_ENABLE:
+		case KVM_SSBD_MITIGATED:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
+		case KVM_SSBD_UNKNOWN:
+		default:
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+		}
+	}
+
+	return -EINVAL;
+}
+
+int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	void __user *uaddr = (void __user *)(long)reg->addr;
+	u64 val;
+
+	switch (reg->id) {
+	case KVM_REG_ARM_PSCI_VERSION:
+		val = kvm_psci_version(vcpu, vcpu->kvm);
+		break;
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+		break;
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+
+		if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+		    kvm_arm_get_vcpu_workaround_2_flag(vcpu))
+			val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
+		break;
+	default:
+		return -ENOENT;
+	}
+
+	if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+	void __user *uaddr = (void __user *)(long)reg->addr;
+	u64 val;
+	int wa_level;
+
+	if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
+		return -EFAULT;
+
+	switch (reg->id) {
+	case KVM_REG_ARM_PSCI_VERSION:
+	{
+		bool wants_02;
+
+		wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
+
+		switch (val) {
+		case KVM_ARM_PSCI_0_1:
+			if (wants_02)
+				return -EINVAL;
+			vcpu->kvm->arch.psci_version = val;
+			return 0;
+		case KVM_ARM_PSCI_0_2:
+		case KVM_ARM_PSCI_1_0:
+			if (!wants_02)
+				return -EINVAL;
+			vcpu->kvm->arch.psci_version = val;
+			return 0;
+		}
+		break;
+	}
+
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+		if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
+			return -EINVAL;
+
+		if (get_kernel_wa_level(reg->id) < val)
+			return -EINVAL;
+
+		return 0;
+
+	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+		if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
+			    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
+			return -EINVAL;
+
+		wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
+
+		if (get_kernel_wa_level(reg->id) < wa_level)
+			return -EINVAL;
+
+		/* The enabled bit must not be set unless the level is AVAIL. */
+		if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+		    wa_level != val)
+			return -EINVAL;
+
+		/* Are we finished or do we need to check the enable bit ? */
+		if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
+			return 0;
+
+		/*
+		 * If this kernel supports the workaround to be switched on
+		 * or off, make sure it matches the requested setting.
+		 */
+		switch (wa_level) {
+		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
+			kvm_arm_set_vcpu_workaround_2_flag(vcpu,
+			    val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
+			break;
+		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
+			kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+			break;
+		}
+
+		return 0;
+	default:
+		return -ENOENT;
+	}
+
+	return -EINVAL;
+}
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
new file mode 100644
index 000000000000..1e0f4c284888
--- /dev/null
+++ b/arch/arm64/kvm/pvtime.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/pvclock-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 steal;
+	__le64 steal_le;
+	u64 offset;
+	int idx;
+	u64 base = vcpu->arch.steal.base;
+
+	if (base == GPA_INVALID)
+		return;
+
+	/* Let's do the local bookkeeping */
+	steal = vcpu->arch.steal.steal;
+	steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
+	vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+	vcpu->arch.steal.steal = steal;
+
+	steal_le = cpu_to_le64(steal);
+	idx = srcu_read_lock(&kvm->srcu);
+	offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+	kvm_put_guest(kvm, base + offset, steal_le, u64);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
+{
+	u32 feature = smccc_get_arg1(vcpu);
+	long val = SMCCC_RET_NOT_SUPPORTED;
+
+	switch (feature) {
+	case ARM_SMCCC_HV_PV_TIME_FEATURES:
+	case ARM_SMCCC_HV_PV_TIME_ST:
+		val = SMCCC_RET_SUCCESS;
+		break;
+	}
+
+	return val;
+}
+
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
+{
+	struct pvclock_vcpu_stolen_time init_values = {};
+	struct kvm *kvm = vcpu->kvm;
+	u64 base = vcpu->arch.steal.base;
+	int idx;
+
+	if (base == GPA_INVALID)
+		return base;
+
+	/*
+	 * Start counting stolen time from the time the guest requests
+	 * the feature enabled.
+	 */
+	vcpu->arch.steal.steal = 0;
+	vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return base;
+}
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+			    struct kvm_device_attr *attr)
+{
+	u64 __user *user = (u64 __user *)attr->addr;
+	struct kvm *kvm = vcpu->kvm;
+	u64 ipa;
+	int ret = 0;
+	int idx;
+
+	if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+		return -ENXIO;
+
+	if (get_user(ipa, user))
+		return -EFAULT;
+	if (!IS_ALIGNED(ipa, 64))
+		return -EINVAL;
+	if (vcpu->arch.steal.base != GPA_INVALID)
+		return -EEXIST;
+
+	/* Check the address is in a valid memslot */
+	idx = srcu_read_lock(&kvm->srcu);
+	if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
+		ret = -EINVAL;
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	if (!ret)
+		vcpu->arch.steal.base = ipa;
+
+	return ret;
+}
+
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+			    struct kvm_device_attr *attr)
+{
+	u64 __user *user = (u64 __user *)attr->addr;
+	u64 ipa;
+
+	if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+		return -ENXIO;
+
+	ipa = vcpu->arch.steal.base;
+
+	if (put_user(ipa, user))
+		return -EFAULT;
+	return 0;
+}
+
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->attr) {
+	case KVM_ARM_VCPU_PVTIME_IPA:
+		return 0;
+	}
+	return -ENXIO;
+}
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
index eab91ad0effb..86f9ea47be29 100644
--- a/arch/arm64/kvm/trace.h
+++ b/arch/arm64/kvm/trace.h
@@ -1,216 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#ifndef _TRACE_ARM64_KVM_H
 #define _TRACE_ARM64_KVM_H
 
-#include <linux/tracepoint.h>
-#include "sys_regs.h"
+#include "trace_arm.h"
+#include "trace_handle_exit.h"
 
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(kvm_wfx_arm64,
-	TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
-	TP_ARGS(vcpu_pc, is_wfe),
-
-	TP_STRUCT__entry(
-		__field(unsigned long,	vcpu_pc)
-		__field(bool,		is_wfe)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc = vcpu_pc;
-		__entry->is_wfe  = is_wfe;
-	),
-
-	TP_printk("guest executed wf%c at: 0x%08lx",
-		  __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_hvc_arm64,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
-	TP_ARGS(vcpu_pc, r0, imm),
-
-	TP_STRUCT__entry(
-		__field(unsigned long, vcpu_pc)
-		__field(unsigned long, r0)
-		__field(unsigned long, imm)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc = vcpu_pc;
-		__entry->r0 = r0;
-		__entry->imm = imm;
-	),
-
-	TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
-		  __entry->vcpu_pc, __entry->r0, __entry->imm)
-);
-
-TRACE_EVENT(kvm_arm_setup_debug,
-	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
-	TP_ARGS(vcpu, guest_debug),
-
-	TP_STRUCT__entry(
-		__field(struct kvm_vcpu *, vcpu)
-		__field(__u32, guest_debug)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu = vcpu;
-		__entry->guest_debug = guest_debug;
-	),
-
-	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_clear_debug,
-	TP_PROTO(__u32 guest_debug),
-	TP_ARGS(guest_debug),
-
-	TP_STRUCT__entry(
-		__field(__u32, guest_debug)
-	),
-
-	TP_fast_assign(
-		__entry->guest_debug = guest_debug;
-	),
-
-	TP_printk("flags: 0x%08x", __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_set_dreg32,
-	TP_PROTO(const char *name, __u32 value),
-	TP_ARGS(name, value),
-
-	TP_STRUCT__entry(
-		__field(const char *, name)
-		__field(__u32, value)
-	),
-
-	TP_fast_assign(
-		__entry->name = name;
-		__entry->value = value;
-	),
-
-	TP_printk("%s: 0x%08x", __entry->name, __entry->value)
-);
-
-TRACE_DEFINE_SIZEOF(__u64);
-
-TRACE_EVENT(kvm_arm_set_regset,
-	TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
-	TP_ARGS(type, len, control, value),
-	TP_STRUCT__entry(
-		__field(const char *, name)
-		__field(int, len)
-		__array(u64, ctrls, 16)
-		__array(u64, values, 16)
-	),
-	TP_fast_assign(
-		__entry->name = type;
-		__entry->len = len;
-		memcpy(__entry->ctrls, control, len << 3);
-		memcpy(__entry->values, value, len << 3);
-	),
-	TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
-		__print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
-		__print_array(__entry->values, __entry->len, sizeof(__u64)))
-);
-
-TRACE_EVENT(trap_reg,
-	TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
-	TP_ARGS(fn, reg, is_write, write_value),
-
-	TP_STRUCT__entry(
-		__field(const char *, fn)
-		__field(int, reg)
-		__field(bool, is_write)
-		__field(u64, write_value)
-	),
-
-	TP_fast_assign(
-		__entry->fn = fn;
-		__entry->reg = reg;
-		__entry->is_write = is_write;
-		__entry->write_value = write_value;
-	),
-
-	TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
-);
-
-TRACE_EVENT(kvm_handle_sys_reg,
-	TP_PROTO(unsigned long hsr),
-	TP_ARGS(hsr),
-
-	TP_STRUCT__entry(
-		__field(unsigned long,	hsr)
-	),
-
-	TP_fast_assign(
-		__entry->hsr = hsr;
-	),
-
-	TP_printk("HSR 0x%08lx", __entry->hsr)
-);
-
-TRACE_EVENT(kvm_sys_access,
-	TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
-	TP_ARGS(vcpu_pc, params, reg),
-
-	TP_STRUCT__entry(
-		__field(unsigned long,			vcpu_pc)
-		__field(bool,				is_write)
-		__field(const char *,			name)
-		__field(u8,				Op0)
-		__field(u8,				Op1)
-		__field(u8,				CRn)
-		__field(u8,				CRm)
-		__field(u8,				Op2)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc = vcpu_pc;
-		__entry->is_write = params->is_write;
-		__entry->name = reg->name;
-		__entry->Op0 = reg->Op0;
-		__entry->Op0 = reg->Op0;
-		__entry->Op1 = reg->Op1;
-		__entry->CRn = reg->CRn;
-		__entry->CRm = reg->CRm;
-		__entry->Op2 = reg->Op2;
-	),
-
-	TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
-		  __entry->vcpu_pc, __entry->name ?: "UNKN",
-		  __entry->Op0, __entry->Op1, __entry->CRn,
-		  __entry->CRm, __entry->Op2,
-		  __entry->is_write ? "write" : "read")
-);
-
-TRACE_EVENT(kvm_set_guest_debug,
-	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
-	TP_ARGS(vcpu, guest_debug),
-
-	TP_STRUCT__entry(
-		__field(struct kvm_vcpu *, vcpu)
-		__field(__u32, guest_debug)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu = vcpu;
-		__entry->guest_debug = guest_debug;
-	),
-
-	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-
-#endif /* _TRACE_ARM64_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
+#endif	/* _TRACE_ARM64_KVM_H */
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
new file mode 100644
index 000000000000..4c71270cc097
--- /dev/null
+++ b/arch/arm64/kvm/trace_arm.h
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ARM_ARM64_KVM_H
+
+#include <kvm/arm_arch_timer.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoints for entry/exit to guest
+ */
+TRACE_EVENT(kvm_entry,
+	TP_PROTO(unsigned long vcpu_pc),
+	TP_ARGS(vcpu_pc),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_pc		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc		= vcpu_pc;
+	),
+
+	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
+	TP_ARGS(ret, esr_ec, vcpu_pc),
+
+	TP_STRUCT__entry(
+		__field(	int,		ret		)
+		__field(	unsigned int,	esr_ec		)
+		__field(	unsigned long,	vcpu_pc		)
+	),
+
+	TP_fast_assign(
+		__entry->ret			= ARM_EXCEPTION_CODE(ret);
+		__entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
+		__entry->vcpu_pc		= vcpu_pc;
+	),
+
+	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+		  __print_symbolic(__entry->ret, kvm_arm_exception_type),
+		  __entry->esr_ec,
+		  __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
+		  __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_guest_fault,
+	TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
+		 unsigned long hxfar,
+		 unsigned long long ipa),
+	TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_pc		)
+		__field(	unsigned long,	hsr		)
+		__field(	unsigned long,	hxfar		)
+		__field(   unsigned long long,	ipa		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc		= vcpu_pc;
+		__entry->hsr			= hsr;
+		__entry->hxfar			= hxfar;
+		__entry->ipa			= ipa;
+	),
+
+	TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+		  __entry->ipa, __entry->hsr,
+		  __entry->hxfar, __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_access_fault,
+	TP_PROTO(unsigned long ipa),
+	TP_ARGS(ipa),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	ipa		)
+	),
+
+	TP_fast_assign(
+		__entry->ipa		= ipa;
+	),
+
+	TP_printk("IPA: %lx", __entry->ipa)
+);
+
+TRACE_EVENT(kvm_irq_line,
+	TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
+	TP_ARGS(type, vcpu_idx, irq_num, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	type		)
+		__field(	int,		vcpu_idx	)
+		__field(	int,		irq_num		)
+		__field(	int,		level		)
+	),
+
+	TP_fast_assign(
+		__entry->type		= type;
+		__entry->vcpu_idx	= vcpu_idx;
+		__entry->irq_num	= irq_num;
+		__entry->level		= level;
+	),
+
+	TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
+		  (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
+		  (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
+		  (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
+		  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
+);
+
+TRACE_EVENT(kvm_mmio_emulate,
+	TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
+		 unsigned long cpsr),
+	TP_ARGS(vcpu_pc, instr, cpsr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_pc		)
+		__field(	unsigned long,	instr		)
+		__field(	unsigned long,	cpsr		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc		= vcpu_pc;
+		__entry->instr			= instr;
+		__entry->cpsr			= cpsr;
+	),
+
+	TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+		  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
+);
+
+TRACE_EVENT(kvm_unmap_hva_range,
+	TP_PROTO(unsigned long start, unsigned long end),
+	TP_ARGS(start, end),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	start		)
+		__field(	unsigned long,	end		)
+	),
+
+	TP_fast_assign(
+		__entry->start		= start;
+		__entry->end		= end;
+	),
+
+	TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+		  __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+	TP_PROTO(unsigned long hva),
+	TP_ARGS(hva),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	hva		)
+	),
+
+	TP_fast_assign(
+		__entry->hva		= hva;
+	),
+
+	TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+	TP_PROTO(unsigned long start, unsigned long end),
+	TP_ARGS(start, end),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	start		)
+		__field(	unsigned long,	end		)
+	),
+
+	TP_fast_assign(
+		__entry->start		= start;
+		__entry->end		= end;
+	),
+
+	TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+		  __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+	TP_PROTO(unsigned long hva),
+	TP_ARGS(hva),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	hva		)
+	),
+
+	TP_fast_assign(
+		__entry->hva		= hva;
+	),
+
+	TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_set_way_flush,
+	    TP_PROTO(unsigned long vcpu_pc, bool cache),
+	    TP_ARGS(vcpu_pc, cache),
+
+	    TP_STRUCT__entry(
+		    __field(	unsigned long,	vcpu_pc		)
+		    __field(	bool,		cache		)
+	    ),
+
+	    TP_fast_assign(
+		    __entry->vcpu_pc		= vcpu_pc;
+		    __entry->cache		= cache;
+	    ),
+
+	    TP_printk("S/W flush at 0x%016lx (cache %s)",
+		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_toggle_cache,
+	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
+	    TP_ARGS(vcpu_pc, was, now),
+
+	    TP_STRUCT__entry(
+		    __field(	unsigned long,	vcpu_pc		)
+		    __field(	bool,		was		)
+		    __field(	bool,		now		)
+	    ),
+
+	    TP_fast_assign(
+		    __entry->vcpu_pc		= vcpu_pc;
+		    __entry->was		= was;
+		    __entry->now		= now;
+	    ),
+
+	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
+		      __entry->vcpu_pc, __entry->was ? "on" : "off",
+		      __entry->now ? "on" : "off")
+);
+
+/*
+ * Tracepoints for arch_timer
+ */
+TRACE_EVENT(kvm_timer_update_irq,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	int,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+TRACE_EVENT(kvm_get_timer_map,
+	TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
+	TP_ARGS(vcpu_id, map),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		vcpu_id	)
+		__field(	int,			direct_vtimer	)
+		__field(	int,			direct_ptimer	)
+		__field(	int,			emul_ptimer	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id		= vcpu_id;
+		__entry->direct_vtimer		= arch_timer_ctx_index(map->direct_vtimer);
+		__entry->direct_ptimer =
+			(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+		__entry->emul_ptimer =
+			(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
+	),
+
+	TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+		  __entry->vcpu_id,
+		  __entry->direct_vtimer,
+		  __entry->direct_ptimer,
+		  __entry->emul_ptimer)
+);
+
+TRACE_EVENT(kvm_timer_save_state,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		ctl		)
+		__field(	unsigned long long,	cval		)
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->ctl			= ctx->cnt_ctl;
+		__entry->cval			= ctx->cnt_cval;
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+		  __entry->ctl,
+		  __entry->cval,
+		  __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_restore_state,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		ctl		)
+		__field(	unsigned long long,	cval		)
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->ctl			= ctx->cnt_ctl;
+		__entry->cval			= ctx->cnt_cval;
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+		  __entry->ctl,
+		  __entry->cval,
+		  __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_hrtimer_expire,
+	TP_PROTO(struct arch_timer_context *ctx),
+	TP_ARGS(ctx),
+
+	TP_STRUCT__entry(
+		__field(	int,			timer_idx	)
+	),
+
+	TP_fast_assign(
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+	),
+
+	TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_emulate,
+	TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
+	TP_ARGS(ctx, should_fire),
+
+	TP_STRUCT__entry(
+		__field(	int,			timer_idx	)
+		__field(	bool,			should_fire	)
+	),
+
+	TP_fast_assign(
+		__entry->timer_idx		= arch_timer_ctx_index(ctx);
+		__entry->should_fire		= should_fire;
+	),
+
+	TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
+		  __entry->timer_idx, __entry->should_fire)
+);
+
+#endif /* _TRACE_ARM_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_arm
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
new file mode 100644
index 000000000000..2c56d1e0f5bd
--- /dev/null
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HANDLE_EXIT_ARM64_KVM_H
+
+#include <linux/tracepoint.h>
+#include "sys_regs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(kvm_wfx_arm64,
+	TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
+	TP_ARGS(vcpu_pc, is_wfe),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	vcpu_pc)
+		__field(bool,		is_wfe)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc = vcpu_pc;
+		__entry->is_wfe  = is_wfe;
+	),
+
+	TP_printk("guest executed wf%c at: 0x%08lx",
+		  __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_hvc_arm64,
+	TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
+	TP_ARGS(vcpu_pc, r0, imm),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, vcpu_pc)
+		__field(unsigned long, r0)
+		__field(unsigned long, imm)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc = vcpu_pc;
+		__entry->r0 = r0;
+		__entry->imm = imm;
+	),
+
+	TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+		  __entry->vcpu_pc, __entry->r0, __entry->imm)
+);
+
+TRACE_EVENT(kvm_arm_setup_debug,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+	TP_ARGS(vcpu, guest_debug),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+	TP_PROTO(__u32 guest_debug),
+	TP_ARGS(guest_debug),
+
+	TP_STRUCT__entry(
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+	TP_PROTO(const char *name, __u32 value),
+	TP_ARGS(name, value),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(__u32, value)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->value = value;
+	),
+
+	TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_DEFINE_SIZEOF(__u64);
+
+TRACE_EVENT(kvm_arm_set_regset,
+	TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+	TP_ARGS(type, len, control, value),
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, len)
+		__array(u64, ctrls, 16)
+		__array(u64, values, 16)
+	),
+	TP_fast_assign(
+		__entry->name = type;
+		__entry->len = len;
+		memcpy(__entry->ctrls, control, len << 3);
+		memcpy(__entry->values, value, len << 3);
+	),
+	TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+		__print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+		__print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+	TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+	TP_ARGS(fn, reg, is_write, write_value),
+
+	TP_STRUCT__entry(
+		__field(const char *, fn)
+		__field(int, reg)
+		__field(bool, is_write)
+		__field(u64, write_value)
+	),
+
+	TP_fast_assign(
+		__entry->fn = fn;
+		__entry->reg = reg;
+		__entry->is_write = is_write;
+		__entry->write_value = write_value;
+	),
+
+	TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+	TP_PROTO(unsigned long hsr),
+	TP_ARGS(hsr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	hsr)
+	),
+
+	TP_fast_assign(
+		__entry->hsr = hsr;
+	),
+
+	TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_sys_access,
+	TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
+	TP_ARGS(vcpu_pc, params, reg),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,			vcpu_pc)
+		__field(bool,				is_write)
+		__field(const char *,			name)
+		__field(u8,				Op0)
+		__field(u8,				Op1)
+		__field(u8,				CRn)
+		__field(u8,				CRm)
+		__field(u8,				Op2)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_pc = vcpu_pc;
+		__entry->is_write = params->is_write;
+		__entry->name = reg->name;
+		__entry->Op0 = reg->Op0;
+		__entry->Op0 = reg->Op0;
+		__entry->Op1 = reg->Op1;
+		__entry->CRn = reg->CRn;
+		__entry->CRm = reg->CRm;
+		__entry->Op2 = reg->Op2;
+	),
+
+	TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
+		  __entry->vcpu_pc, __entry->name ?: "UNKN",
+		  __entry->Op0, __entry->Op1, __entry->CRn,
+		  __entry->CRm, __entry->Op2,
+		  __entry->is_write ? "write" : "read")
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+	TP_ARGS(vcpu, guest_debug),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_handle_exit
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index e7d1ea92095d..2f92bdcb1188 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -7,7 +7,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/kvm_emulate.h>
-#include "vgic.h"
+#include "vgic/vgic.h"
 #include "sys_regs.h"
 
 static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h
new file mode 100644
index 000000000000..83c64401a7fc
--- /dev/null
+++ b/arch/arm64/kvm/vgic/trace.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VGIC_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(vgic_update_irq_pending,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	bool,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level: %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+#endif /* _TRACE_VGIC_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
new file mode 100644
index 000000000000..b13a9e3f99dd
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Linaro
+ * Author: Christoffer Dall <christoffer.dall@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/interrupt.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Structure to control looping through the entire vgic state.  We start at
+ * zero for each field and move upwards.  So, if dist_id is 0 we print the
+ * distributor info.  When dist_id is 1, we have already printed it and move
+ * on.
+ *
+ * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
+ * so on.
+ */
+struct vgic_state_iter {
+	int nr_cpus;
+	int nr_spis;
+	int nr_lpis;
+	int dist_id;
+	int vcpu_id;
+	int intid;
+	int lpi_idx;
+	u32 *lpi_array;
+};
+
+static void iter_next(struct vgic_state_iter *iter)
+{
+	if (iter->dist_id == 0) {
+		iter->dist_id++;
+		return;
+	}
+
+	iter->intid++;
+	if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
+	    ++iter->vcpu_id < iter->nr_cpus)
+		iter->intid = 0;
+
+	if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+		if (iter->lpi_idx < iter->nr_lpis)
+			iter->intid = iter->lpi_array[iter->lpi_idx];
+		iter->lpi_idx++;
+	}
+}
+
+static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
+		      loff_t pos)
+{
+	int nr_cpus = atomic_read(&kvm->online_vcpus);
+
+	memset(iter, 0, sizeof(*iter));
+
+	iter->nr_cpus = nr_cpus;
+	iter->nr_spis = kvm->arch.vgic.nr_spis;
+	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+		iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
+		if (iter->nr_lpis < 0)
+			iter->nr_lpis = 0;
+	}
+
+	/* Fast forward to the right position if needed */
+	while (pos--)
+		iter_next(iter);
+}
+
+static bool end_of_vgic(struct vgic_state_iter *iter)
+{
+	return iter->dist_id > 0 &&
+		iter->vcpu_id == iter->nr_cpus &&
+		iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
+		iter->lpi_idx > iter->nr_lpis;
+}
+
+static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
+{
+	struct kvm *kvm = (struct kvm *)s->private;
+	struct vgic_state_iter *iter;
+
+	mutex_lock(&kvm->lock);
+	iter = kvm->arch.vgic.iter;
+	if (iter) {
+		iter = ERR_PTR(-EBUSY);
+		goto out;
+	}
+
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		iter = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	iter_init(kvm, iter, *pos);
+	kvm->arch.vgic.iter = iter;
+
+	if (end_of_vgic(iter))
+		iter = NULL;
+out:
+	mutex_unlock(&kvm->lock);
+	return iter;
+}
+
+static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kvm *kvm = (struct kvm *)s->private;
+	struct vgic_state_iter *iter = kvm->arch.vgic.iter;
+
+	++*pos;
+	iter_next(iter);
+	if (end_of_vgic(iter))
+		iter = NULL;
+	return iter;
+}
+
+static void vgic_debug_stop(struct seq_file *s, void *v)
+{
+	struct kvm *kvm = (struct kvm *)s->private;
+	struct vgic_state_iter *iter;
+
+	/*
+	 * If the seq file wasn't properly opened, there's nothing to clearn
+	 * up.
+	 */
+	if (IS_ERR(v))
+		return;
+
+	mutex_lock(&kvm->lock);
+	iter = kvm->arch.vgic.iter;
+	kfree(iter->lpi_array);
+	kfree(iter);
+	kvm->arch.vgic.iter = NULL;
+	mutex_unlock(&kvm->lock);
+}
+
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+{
+	bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
+
+	seq_printf(s, "Distributor\n");
+	seq_printf(s, "===========\n");
+	seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
+	seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
+	if (v3)
+		seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
+	seq_printf(s, "enabled:\t%d\n", dist->enabled);
+	seq_printf(s, "\n");
+
+	seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
+	seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
+	seq_printf(s, "G=group\n");
+}
+
+static void print_header(struct seq_file *s, struct vgic_irq *irq,
+			 struct kvm_vcpu *vcpu)
+{
+	int id = 0;
+	char *hdr = "SPI ";
+
+	if (vcpu) {
+		hdr = "VCPU";
+		id = vcpu->vcpu_id;
+	}
+
+	seq_printf(s, "\n");
+	seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+	seq_printf(s, "----------------------------------------------------------------\n");
+}
+
+static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
+			    struct kvm_vcpu *vcpu)
+{
+	char *type;
+	bool pending;
+
+	if (irq->intid < VGIC_NR_SGIS)
+		type = "SGI";
+	else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
+		type = "PPI";
+	else if (irq->intid < VGIC_MAX_SPI)
+		type = "SPI";
+	else
+		type = "LPI";
+
+	if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
+		print_header(s, irq, vcpu);
+
+	pending = irq->pending_latch;
+	if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+		int err;
+
+		err = irq_get_irqchip_state(irq->host_irq,
+					    IRQCHIP_STATE_PENDING,
+					    &pending);
+		WARN_ON_ONCE(err);
+	}
+
+	seq_printf(s, "       %s %4d "
+		      "    %2d "
+		      "%d%d%d%d%d%d%d "
+		      "%8d "
+		      "%8x "
+		      " %2x "
+		      "%3d "
+		      "     %2d "
+		      "\n",
+			type, irq->intid,
+			(irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
+			pending,
+			irq->line_level,
+			irq->active,
+			irq->enabled,
+			irq->hw,
+			irq->config == VGIC_CONFIG_LEVEL,
+			irq->group,
+			irq->hwintid,
+			irq->mpidr,
+			irq->source,
+			irq->priority,
+			(irq->vcpu) ? irq->vcpu->vcpu_id : -1);
+}
+
+static int vgic_debug_show(struct seq_file *s, void *v)
+{
+	struct kvm *kvm = (struct kvm *)s->private;
+	struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
+	struct vgic_irq *irq;
+	struct kvm_vcpu *vcpu = NULL;
+	unsigned long flags;
+
+	if (iter->dist_id == 0) {
+		print_dist_state(s, &kvm->arch.vgic);
+		return 0;
+	}
+
+	if (!kvm->arch.vgic.initialized)
+		return 0;
+
+	if (iter->vcpu_id < iter->nr_cpus)
+		vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
+
+	irq = vgic_get_irq(kvm, vcpu, iter->intid);
+	if (!irq) {
+		seq_printf(s, "       LPI %4d freed\n", iter->intid);
+		return 0;
+	}
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	print_irq_state(s, irq, vcpu);
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	vgic_put_irq(kvm, irq);
+	return 0;
+}
+
+static const struct seq_operations vgic_debug_seq_ops = {
+	.start = vgic_debug_start,
+	.next  = vgic_debug_next,
+	.stop  = vgic_debug_stop,
+	.show  = vgic_debug_show
+};
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	ret = seq_open(file, &vgic_debug_seq_ops);
+	if (!ret) {
+		struct seq_file *seq;
+		/* seq_open will have modified file->private_data */
+		seq = file->private_data;
+		seq->private = inode->i_private;
+	}
+
+	return ret;
+};
+
+static const struct file_operations vgic_debug_fops = {
+	.owner   = THIS_MODULE,
+	.open    = debug_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+void vgic_debug_init(struct kvm *kvm)
+{
+	debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
+			    &vgic_debug_fops);
+}
+
+void vgic_debug_destroy(struct kvm *kvm)
+{
+}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
new file mode 100644
index 000000000000..32e32d67a127
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.  The basic
+ * idea is that even though the VGIC is not functional or not requested from
+ * user space, the critical path of the run loop can still call VGIC functions
+ * that just won't do anything, without them having to check additional
+ * initialization flags to ensure they don't look at uninitialized data
+ * structures.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_vcpu_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
+ */
+
+/* EARLY INIT */
+
+/**
+ * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
+ * @kvm: The VM whose VGIC districutor should be initialized
+ *
+ * Only do initialization of static structures that don't require any
+ * allocation or sizing information from userspace.  vgic_init() called
+ * kvm_vgic_dist_init() which takes care of the rest.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	INIT_LIST_HEAD(&dist->lpi_list_head);
+	INIT_LIST_HEAD(&dist->lpi_translation_cache);
+	raw_spin_lock_init(&dist->lpi_list_lock);
+}
+
+/* CREATION */
+
+/**
+ * kvm_vgic_create: triggered by the instantiation of the VGIC device by
+ * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
+ * or through the generic KVM_CREATE_DEVICE API ioctl.
+ * irqchip_in_kernel() tells you if this function succeeded or not.
+ * @kvm: kvm struct pointer
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ */
+int kvm_vgic_create(struct kvm *kvm, u32 type)
+{
+	int i, ret;
+	struct kvm_vcpu *vcpu;
+
+	if (irqchip_in_kernel(kvm))
+		return -EEXIST;
+
+	/*
+	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
+	 * which had no chance yet to check the availability of the GICv2
+	 * emulation. So check this here again. KVM_CREATE_DEVICE does
+	 * the proper checks already.
+	 */
+	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+		!kvm_vgic_global_state.can_emulate_gicv2)
+		return -ENODEV;
+
+	ret = -EBUSY;
+	if (!lock_all_vcpus(kvm))
+		return ret;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.has_run_once)
+			goto out_unlock;
+	}
+	ret = 0;
+
+	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+		kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+	else
+		kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+
+	if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+		ret = -E2BIG;
+		goto out_unlock;
+	}
+
+	kvm->arch.vgic.in_kernel = true;
+	kvm->arch.vgic.vgic_model = type;
+
+	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
+
+	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+	else
+		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+
+out_unlock:
+	unlock_all_vcpus(kvm);
+	return ret;
+}
+
+/* INIT/DESTROY */
+
+/**
+ * kvm_vgic_dist_init: initialize the dist data structures
+ * @kvm: kvm struct pointer
+ * @nr_spis: number of spis, frozen by caller
+ */
+static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+	int i;
+
+	dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
+	if (!dist->spis)
+		return  -ENOMEM;
+
+	/*
+	 * In the following code we do not take the irq struct lock since
+	 * no other action on irq structs can happen while the VGIC is
+	 * not initialized yet:
+	 * If someone wants to inject an interrupt or does a MMIO access, we
+	 * require prior initialization in case of a virtual GICv3 or trigger
+	 * initialization when using a virtual GICv2.
+	 */
+	for (i = 0; i < nr_spis; i++) {
+		struct vgic_irq *irq = &dist->spis[i];
+
+		irq->intid = i + VGIC_NR_PRIVATE_IRQS;
+		INIT_LIST_HEAD(&irq->ap_list);
+		raw_spin_lock_init(&irq->irq_lock);
+		irq->vcpu = NULL;
+		irq->target_vcpu = vcpu0;
+		kref_init(&irq->refcount);
+		switch (dist->vgic_model) {
+		case KVM_DEV_TYPE_ARM_VGIC_V2:
+			irq->targets = 0;
+			irq->group = 0;
+			break;
+		case KVM_DEV_TYPE_ARM_VGIC_V3:
+			irq->mpidr = 0;
+			irq->group = 1;
+			break;
+		default:
+			kfree(dist->spis);
+			dist->spis = NULL;
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	int ret = 0;
+	int i;
+
+	vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+	raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+	atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
+	/*
+	 * Enable and configure all SGIs to be edge-triggered and
+	 * configure all PPIs as level-triggered.
+	 */
+	for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+		struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+
+		INIT_LIST_HEAD(&irq->ap_list);
+		raw_spin_lock_init(&irq->irq_lock);
+		irq->intid = i;
+		irq->vcpu = NULL;
+		irq->target_vcpu = vcpu;
+		kref_init(&irq->refcount);
+		if (vgic_irq_is_sgi(i)) {
+			/* SGIs */
+			irq->enabled = 1;
+			irq->config = VGIC_CONFIG_EDGE;
+		} else {
+			/* PPIs */
+			irq->config = VGIC_CONFIG_LEVEL;
+		}
+	}
+
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return 0;
+
+	/*
+	 * If we are creating a VCPU with a GICv3 we must also register the
+	 * KVM io device for the redistributor that belongs to this VCPU.
+	 */
+	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+		mutex_lock(&vcpu->kvm->lock);
+		ret = vgic_register_redist_iodev(vcpu);
+		mutex_unlock(&vcpu->kvm->lock);
+	}
+	return ret;
+}
+
+static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_enable(vcpu);
+	else
+		vgic_v3_enable(vcpu);
+}
+
+/*
+ * vgic_init: allocates and initializes dist and vcpu data structures
+ * depending on two dimensioning parameters:
+ * - the number of spis
+ * - the number of vcpus
+ * The function is generally called when nr_spis has been explicitly set
+ * by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
+ * vgic_initialized() returns true when this function has succeeded.
+ * Must be called with kvm->lock held!
+ */
+int vgic_init(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct kvm_vcpu *vcpu;
+	int ret = 0, i, idx;
+
+	if (vgic_initialized(kvm))
+		return 0;
+
+	/* Are we also in the middle of creating a VCPU? */
+	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+		return -EBUSY;
+
+	/* freeze the number of spis */
+	if (!dist->nr_spis)
+		dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+
+	ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
+	if (ret)
+		goto out;
+
+	/* Initialize groups on CPUs created before the VGIC type was known */
+	kvm_for_each_vcpu(idx, vcpu, kvm) {
+		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+		for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+			struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+			switch (dist->vgic_model) {
+			case KVM_DEV_TYPE_ARM_VGIC_V3:
+				irq->group = 1;
+				irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+				break;
+			case KVM_DEV_TYPE_ARM_VGIC_V2:
+				irq->group = 0;
+				irq->targets = 1U << idx;
+				break;
+			default:
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+	}
+
+	if (vgic_has_its(kvm))
+		vgic_lpi_translation_cache_init(kvm);
+
+	/*
+	 * If we have GICv4.1 enabled, unconditionnaly request enable the
+	 * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
+	 * enable it if we present a virtual ITS to the guest.
+	 */
+	if (vgic_supports_direct_msis(kvm)) {
+		ret = vgic_v4_init(kvm);
+		if (ret)
+			goto out;
+	}
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_vgic_vcpu_enable(vcpu);
+
+	ret = kvm_vgic_setup_default_irq_routing(kvm);
+	if (ret)
+		goto out;
+
+	vgic_debug_init(kvm);
+
+	dist->implementation_rev = 2;
+	dist->initialized = true;
+
+out:
+	return ret;
+}
+
+static void kvm_vgic_dist_destroy(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_redist_region *rdreg, *next;
+
+	dist->ready = false;
+	dist->initialized = false;
+
+	kfree(dist->spis);
+	dist->spis = NULL;
+	dist->nr_spis = 0;
+
+	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+		list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
+			list_del(&rdreg->list);
+			kfree(rdreg);
+		}
+		INIT_LIST_HEAD(&dist->rd_regions);
+	}
+
+	if (vgic_has_its(kvm))
+		vgic_lpi_translation_cache_destroy(kvm);
+
+	if (vgic_supports_direct_msis(kvm))
+		vgic_v4_teardown(kvm);
+}
+
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	/*
+	 * Retire all pending LPIs on this vcpu anyway as we're
+	 * going to destroy it.
+	 */
+	vgic_flush_pending_lpis(vcpu);
+
+	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+}
+
+/* To be called with kvm->lock held */
+static void __kvm_vgic_destroy(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	vgic_debug_destroy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_vgic_vcpu_destroy(vcpu);
+
+	kvm_vgic_dist_destroy(kvm);
+}
+
+void kvm_vgic_destroy(struct kvm *kvm)
+{
+	mutex_lock(&kvm->lock);
+	__kvm_vgic_destroy(kvm);
+	mutex_unlock(&kvm->lock);
+}
+
+/**
+ * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
+ * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
+ * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
+ * @kvm: kvm struct pointer
+ */
+int vgic_lazy_init(struct kvm *kvm)
+{
+	int ret = 0;
+
+	if (unlikely(!vgic_initialized(kvm))) {
+		/*
+		 * We only provide the automatic initialization of the VGIC
+		 * for the legacy case of a GICv2. Any other type must
+		 * be explicitly initialized once setup with the respective
+		 * KVM device call.
+		 */
+		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+			return -EBUSY;
+
+		mutex_lock(&kvm->lock);
+		ret = vgic_init(kvm);
+		mutex_unlock(&kvm->lock);
+	}
+
+	return ret;
+}
+
+/* RESOURCE MAPPING */
+
+/**
+ * Map the MMIO regions depending on the VGIC model exposed to the guest
+ * called on the first VCPU run.
+ * Also map the virtual CPU interface into the VM.
+ * v2/v3 derivatives call vgic_init if not already done.
+ * vgic_ready() returns true if this function has succeeded.
+ * @kvm: kvm struct pointer
+ */
+int kvm_vgic_map_resources(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int ret = 0;
+
+	mutex_lock(&kvm->lock);
+	if (!irqchip_in_kernel(kvm))
+		goto out;
+
+	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+		ret = vgic_v2_map_resources(kvm);
+	else
+		ret = vgic_v3_map_resources(kvm);
+
+	if (ret)
+		__kvm_vgic_destroy(kvm);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+/* GENERIC PROBE */
+
+static int vgic_init_cpu_starting(unsigned int cpu)
+{
+	enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
+	return 0;
+}
+
+
+static int vgic_init_cpu_dying(unsigned int cpu)
+{
+	disable_percpu_irq(kvm_vgic_global_state.maint_irq);
+	return 0;
+}
+
+static irqreturn_t vgic_maintenance_handler(int irq, void *data)
+{
+	/*
+	 * We cannot rely on the vgic maintenance interrupt to be
+	 * delivered synchronously. This means we can only use it to
+	 * exit the VM, and we perform the handling of EOIed
+	 * interrupts on the exit path (see vgic_fold_lr_state).
+	 */
+	return IRQ_HANDLED;
+}
+
+/**
+ * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
+ *
+ * For a specific CPU, initialize the GIC VE hardware.
+ */
+void kvm_vgic_init_cpu_hardware(void)
+{
+	BUG_ON(preemptible());
+
+	/*
+	 * We want to make sure the list registers start out clear so that we
+	 * only have the program the used registers.
+	 */
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_init_lrs();
+	else
+		kvm_call_hyp(__vgic_v3_init_lrs);
+}
+
+/**
+ * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
+ * according to the host GIC model. Accordingly calls either
+ * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
+ * instantiated by a guest later on .
+ */
+int kvm_vgic_hyp_init(void)
+{
+	const struct gic_kvm_info *gic_kvm_info;
+	int ret;
+
+	gic_kvm_info = gic_get_kvm_info();
+	if (!gic_kvm_info)
+		return -ENODEV;
+
+	if (!gic_kvm_info->maint_irq) {
+		kvm_err("No vgic maintenance irq\n");
+		return -ENXIO;
+	}
+
+	switch (gic_kvm_info->type) {
+	case GIC_V2:
+		ret = vgic_v2_probe(gic_kvm_info);
+		break;
+	case GIC_V3:
+		ret = vgic_v3_probe(gic_kvm_info);
+		if (!ret) {
+			static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+			kvm_info("GIC system register CPU interface enabled\n");
+		}
+		break;
+	default:
+		ret = -ENODEV;
+	}
+
+	if (ret)
+		return ret;
+
+	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
+				 vgic_maintenance_handler,
+				 "vgic", kvm_get_running_vcpus());
+	if (ret) {
+		kvm_err("Cannot register interrupt %d\n",
+			kvm_vgic_global_state.maint_irq);
+		return ret;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
+				"kvm/arm/vgic:starting",
+				vgic_init_cpu_starting, vgic_init_cpu_dying);
+	if (ret) {
+		kvm_err("Cannot register vgic CPU notifier\n");
+		goto out_free_irq;
+	}
+
+	kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
+	return 0;
+
+out_free_irq:
+	free_percpu_irq(kvm_vgic_global_state.maint_irq,
+			kvm_get_running_vcpus());
+	return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
new file mode 100644
index 000000000000..d8cdfea5cc96
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-irqfd.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+#include <kvm/arm_vgic.h>
+#include "vgic.h"
+
+/**
+ * vgic_irqfd_set_irq: inject the IRQ corresponding to the
+ * irqchip routing entry
+ *
+ * This is the entry point for irqfd IRQ injection
+ */
+static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
+			struct kvm *kvm, int irq_source_id,
+			int level, bool line_status)
+{
+	unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
+
+	if (!vgic_valid_spi(kvm, spi_id))
+		return -EINVAL;
+	return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
+}
+
+/**
+ * kvm_set_routing_entry: populate a kvm routing entry
+ * from a user routing entry
+ *
+ * @kvm: the VM this entry is applied to
+ * @e: kvm kernel routing entry handle
+ * @ue: user api routing entry handle
+ * return 0 on success, -EINVAL on errors.
+ */
+int kvm_set_routing_entry(struct kvm *kvm,
+			  struct kvm_kernel_irq_routing_entry *e,
+			  const struct kvm_irq_routing_entry *ue)
+{
+	int r = -EINVAL;
+
+	switch (ue->type) {
+	case KVM_IRQ_ROUTING_IRQCHIP:
+		e->set = vgic_irqfd_set_irq;
+		e->irqchip.irqchip = ue->u.irqchip.irqchip;
+		e->irqchip.pin = ue->u.irqchip.pin;
+		if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
+		    (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
+			goto out;
+		break;
+	case KVM_IRQ_ROUTING_MSI:
+		e->set = kvm_set_msi;
+		e->msi.address_lo = ue->u.msi.address_lo;
+		e->msi.address_hi = ue->u.msi.address_hi;
+		e->msi.data = ue->u.msi.data;
+		e->msi.flags = ue->flags;
+		e->msi.devid = ue->u.msi.devid;
+		break;
+	default:
+		goto out;
+	}
+	r = 0;
+out:
+	return r;
+}
+
+static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
+			     struct kvm_msi *msi)
+{
+	msi->address_lo = e->msi.address_lo;
+	msi->address_hi = e->msi.address_hi;
+	msi->data = e->msi.data;
+	msi->flags = e->msi.flags;
+	msi->devid = e->msi.devid;
+}
+/**
+ * kvm_set_msi: inject the MSI corresponding to the
+ * MSI routing entry
+ *
+ * This is the entry point for irqfd MSI injection
+ * and userspace MSI injection.
+ */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id,
+		int level, bool line_status)
+{
+	struct kvm_msi msi;
+
+	if (!vgic_has_its(kvm))
+		return -ENODEV;
+
+	if (!level)
+		return -1;
+
+	kvm_populate_msi(e, &msi);
+	return vgic_its_inject_msi(kvm, &msi);
+}
+
+/**
+ * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
+ *
+ * Currently only direct MSI injection is supported.
+ */
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+			      struct kvm *kvm, int irq_source_id, int level,
+			      bool line_status)
+{
+	if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) {
+		struct kvm_msi msi;
+
+		kvm_populate_msi(e, &msi);
+		if (!vgic_its_inject_cached_translation(kvm, &msi))
+			return 0;
+	}
+
+	return -EWOULDBLOCK;
+}
+
+int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
+{
+	struct kvm_irq_routing_entry *entries;
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	u32 nr = dist->nr_spis;
+	int i, ret;
+
+	entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0; i < nr; i++) {
+		entries[i].gsi = i;
+		entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+		entries[i].u.irqchip.irqchip = 0;
+		entries[i].u.irqchip.pin = i;
+	}
+	ret = kvm_set_irq_routing(kvm, entries, nr, 0);
+	kfree(entries);
+	return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
new file mode 100644
index 000000000000..c012a52b19f5
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -0,0 +1,2783 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+#include <linux/list_sort.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+static int vgic_its_save_tables_v0(struct vgic_its *its);
+static int vgic_its_restore_tables_v0(struct vgic_its *its);
+static int vgic_its_commit_v0(struct vgic_its *its);
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+			     struct kvm_vcpu *filter_vcpu, bool needs_inv);
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
+				     struct kvm_vcpu *vcpu)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+	unsigned long flags;
+	int ret;
+
+	/* In this case there is no put, since we keep the reference. */
+	if (irq)
+		return irq;
+
+	irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+	if (!irq)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&irq->lpi_list);
+	INIT_LIST_HEAD(&irq->ap_list);
+	raw_spin_lock_init(&irq->irq_lock);
+
+	irq->config = VGIC_CONFIG_EDGE;
+	kref_init(&irq->refcount);
+	irq->intid = intid;
+	irq->target_vcpu = vcpu;
+	irq->group = 1;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+	/*
+	 * There could be a race with another vgic_add_lpi(), so we need to
+	 * check that we don't add a second list entry with the same LPI.
+	 */
+	list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+		if (oldirq->intid != intid)
+			continue;
+
+		/* Someone was faster with adding this LPI, lets use that. */
+		kfree(irq);
+		irq = oldirq;
+
+		/*
+		 * This increases the refcount, the caller is expected to
+		 * call vgic_put_irq() on the returned pointer once it's
+		 * finished with the IRQ.
+		 */
+		vgic_get_irq_kref(irq);
+
+		goto out_unlock;
+	}
+
+	list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+	dist->lpi_list_count++;
+
+out_unlock:
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	/*
+	 * We "cache" the configuration table entries in our struct vgic_irq's.
+	 * However we only have those structs for mapped IRQs, so we read in
+	 * the respective config data from memory here upon mapping the LPI.
+	 *
+	 * Should any of these fail, behave as if we couldn't create the LPI
+	 * by dropping the refcount and returning the error.
+	 */
+	ret = update_lpi_config(kvm, irq, NULL, false);
+	if (ret) {
+		vgic_put_irq(kvm, irq);
+		return ERR_PTR(ret);
+	}
+
+	ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
+	if (ret) {
+		vgic_put_irq(kvm, irq);
+		return ERR_PTR(ret);
+	}
+
+	return irq;
+}
+
+struct its_device {
+	struct list_head dev_list;
+
+	/* the head for the list of ITTEs */
+	struct list_head itt_head;
+	u32 num_eventid_bits;
+	gpa_t itt_addr;
+	u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+	struct list_head coll_list;
+
+	u32 collection_id;
+	u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+				((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_ite {
+	struct list_head ite_list;
+
+	struct vgic_irq *irq;
+	struct its_collection *collection;
+	u32 event_id;
+};
+
+struct vgic_translation_cache_entry {
+	struct list_head	entry;
+	phys_addr_t		db;
+	u32			devid;
+	u32			eventid;
+	struct vgic_irq		*irq;
+};
+
+/**
+ * struct vgic_its_abi - ITS abi ops and settings
+ * @cte_esz: collection table entry size
+ * @dte_esz: device table entry size
+ * @ite_esz: interrupt translation table entry size
+ * @save tables: save the ITS tables into guest RAM
+ * @restore_tables: restore the ITS internal structs from tables
+ *  stored in guest RAM
+ * @commit: initialize the registers which expose the ABI settings,
+ *  especially the entry sizes
+ */
+struct vgic_its_abi {
+	int cte_esz;
+	int dte_esz;
+	int ite_esz;
+	int (*save_tables)(struct vgic_its *its);
+	int (*restore_tables)(struct vgic_its *its);
+	int (*commit)(struct vgic_its *its);
+};
+
+#define ABI_0_ESZ	8
+#define ESZ_MAX		ABI_0_ESZ
+
+static const struct vgic_its_abi its_table_abi_versions[] = {
+	[0] = {
+	 .cte_esz = ABI_0_ESZ,
+	 .dte_esz = ABI_0_ESZ,
+	 .ite_esz = ABI_0_ESZ,
+	 .save_tables = vgic_its_save_tables_v0,
+	 .restore_tables = vgic_its_restore_tables_v0,
+	 .commit = vgic_its_commit_v0,
+	},
+};
+
+#define NR_ITS_ABIS	ARRAY_SIZE(its_table_abi_versions)
+
+inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
+{
+	return &its_table_abi_versions[its->abi_rev];
+}
+
+static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
+{
+	const struct vgic_its_abi *abi;
+
+	its->abi_rev = rev;
+	abi = vgic_its_get_abi(its);
+	return abi->commit(its);
+}
+
+/*
+ * Find and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+	struct its_device *device;
+
+	list_for_each_entry(device, &its->device_list, dev_list)
+		if (device_id == device->device_id)
+			return device;
+
+	return NULL;
+}
+
+/*
+ * Find and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
+				  u32 event_id)
+{
+	struct its_device *device;
+	struct its_ite *ite;
+
+	device = find_its_device(its, device_id);
+	if (device == NULL)
+		return NULL;
+
+	list_for_each_entry(ite, &device->itt_head, ite_list)
+		if (ite->event_id == event_id)
+			return ite;
+
+	return NULL;
+}
+
+/* To be used as an iterator this macro misses the enclosing parentheses */
+#define for_each_lpi_its(dev, ite, its) \
+	list_for_each_entry(dev, &(its)->device_list, dev_list) \
+		list_for_each_entry(ite, &(dev)->itt_head, ite_list)
+
+#define GIC_LPI_OFFSET 8192
+
+#define VITS_TYPER_IDBITS 16
+#define VITS_TYPER_DEVBITS 16
+#define VITS_DTE_MAX_DEVID_OFFSET	(BIT(14) - 1)
+#define VITS_ITE_MAX_EVENTID_OFFSET	(BIT(16) - 1)
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+	struct its_collection *collection;
+
+	list_for_each_entry(collection, &its->collection_list, coll_list) {
+		if (coll_id == collection->collection_id)
+			return collection;
+	}
+
+	return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p)	((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p)	((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+			     struct kvm_vcpu *filter_vcpu, bool needs_inv)
+{
+	u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+	u8 prop;
+	int ret;
+	unsigned long flags;
+
+	ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+				  &prop, 1);
+
+	if (ret)
+		return ret;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+		irq->priority = LPI_PROP_PRIORITY(prop);
+		irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+		if (!irq->hw) {
+			vgic_queue_irq_unlock(kvm, irq, flags);
+			return 0;
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	if (irq->hw)
+		return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
+
+	return 0;
+}
+
+/*
+ * Create a snapshot of the current LPIs targeting @vcpu, so that we can
+ * enumerate those LPIs without holding any lock.
+ * Returns their number and puts the kmalloc'ed array into intid_ptr.
+ */
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	unsigned long flags;
+	u32 *intids;
+	int irq_count, i = 0;
+
+	/*
+	 * There is an obvious race between allocating the array and LPIs
+	 * being mapped/unmapped. If we ended up here as a result of a
+	 * command, we're safe (locks are held, preventing another
+	 * command). If coming from another path (such as enabling LPIs),
+	 * we must be careful not to overrun the array.
+	 */
+	irq_count = READ_ONCE(dist->lpi_list_count);
+	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+	if (!intids)
+		return -ENOMEM;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		if (i == irq_count)
+			break;
+		/* We don't need to "get" the IRQ, as we hold the list lock. */
+		if (vcpu && irq->target_vcpu != vcpu)
+			continue;
+		intids[i++] = irq->intid;
+	}
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	*intid_ptr = intids;
+	return i;
+}
+
+static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	irq->target_vcpu = vcpu;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	if (irq->hw) {
+		struct its_vlpi_map map;
+
+		ret = its_get_vlpi(irq->host_irq, &map);
+		if (ret)
+			return ret;
+
+		if (map.vpe)
+			atomic_dec(&map.vpe->vlpi_count);
+		map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+		atomic_inc(&map.vpe->vlpi_count);
+
+		ret = its_map_vlpi(irq->host_irq, &map);
+	}
+
+	return ret;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for a LPIs has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!its_is_collection_mapped(ite->collection))
+		return;
+
+	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+	update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+				       struct its_collection *coll)
+{
+	struct its_device *device;
+	struct its_ite *ite;
+
+	for_each_lpi_its(device, ite, its) {
+		if (!ite->collection || coll != ite->collection)
+			continue;
+
+		update_affinity_ite(kvm, ite);
+	}
+}
+
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+	int nr_idbits = (propbaser & 0x1f) + 1;
+
+	return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
+/*
+ * Sync the pending table pending bit of LPIs targeting @vcpu
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+	gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+	struct vgic_irq *irq;
+	int last_byte_offset = -1;
+	int ret = 0;
+	u32 *intids;
+	int nr_irqs, i;
+	unsigned long flags;
+	u8 pendmask;
+
+	nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
+	if (nr_irqs < 0)
+		return nr_irqs;
+
+	for (i = 0; i < nr_irqs; i++) {
+		int byte_offset, bit_nr;
+
+		byte_offset = intids[i] / BITS_PER_BYTE;
+		bit_nr = intids[i] % BITS_PER_BYTE;
+
+		/*
+		 * For contiguously allocated LPIs chances are we just read
+		 * this very same byte in the last iteration. Reuse that.
+		 */
+		if (byte_offset != last_byte_offset) {
+			ret = kvm_read_guest_lock(vcpu->kvm,
+						  pendbase + byte_offset,
+						  &pendmask, 1);
+			if (ret) {
+				kfree(intids);
+				return ret;
+			}
+			last_byte_offset = byte_offset;
+		}
+
+		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = pendmask & (1U << bit_nr);
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	kfree(intids);
+
+	return ret;
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+					      struct vgic_its *its,
+					      gpa_t addr, unsigned int len)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 reg = GITS_TYPER_PLPIS;
+
+	/*
+	 * We use linear CPU numbers for redistributor addressing,
+	 * so GITS_TYPER.PTA is 0.
+	 * Also we force all PROPBASER registers to be the same, so
+	 * CommonLPIAff is 0 as well.
+	 * To avoid memory waste in the guest, we keep the number of IDBits and
+	 * DevBits low - as least for the time being.
+	 */
+	reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
+	reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
+	reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
+
+	return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+					     struct vgic_its *its,
+					     gpa_t addr, unsigned int len)
+{
+	u32 val;
+
+	val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
+	val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
+	return val;
+}
+
+static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
+					    struct vgic_its *its,
+					    gpa_t addr, unsigned int len,
+					    unsigned long val)
+{
+	u32 rev = GITS_IIDR_REV(val);
+
+	if (rev >= NR_ITS_ABIS)
+		return -EINVAL;
+	return vgic_its_set_abi(its, rev);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	switch (addr & 0xffff) {
+	case GITS_PIDR0:
+		return 0x92;	/* part number, bits[7:0] */
+	case GITS_PIDR1:
+		return 0xb4;	/* part number, bits[11:8] */
+	case GITS_PIDR2:
+		return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+	case GITS_PIDR4:
+		return 0x40;	/* This is a 64K software visible page */
+	/* The following are the ID registers for (any) GIC. */
+	case GITS_CIDR0:
+		return 0x0d;
+	case GITS_CIDR1:
+		return 0xf0;
+	case GITS_CIDR2:
+		return 0x05;
+	case GITS_CIDR3:
+		return 0xb1;
+	}
+
+	return 0;
+}
+
+static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
+					       phys_addr_t db,
+					       u32 devid, u32 eventid)
+{
+	struct vgic_translation_cache_entry *cte;
+
+	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+		/*
+		 * If we hit a NULL entry, there is nothing after this
+		 * point.
+		 */
+		if (!cte->irq)
+			break;
+
+		if (cte->db != db || cte->devid != devid ||
+		    cte->eventid != eventid)
+			continue;
+
+		/*
+		 * Move this entry to the head, as it is the most
+		 * recently used.
+		 */
+		if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
+			list_move(&cte->entry, &dist->lpi_translation_cache);
+
+		return cte->irq;
+	}
+
+	return NULL;
+}
+
+static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
+					     u32 devid, u32 eventid)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+	irq = __vgic_its_check_cache(dist, db, devid, eventid);
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	return irq;
+}
+
+static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
+				       u32 devid, u32 eventid,
+				       struct vgic_irq *irq)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_translation_cache_entry *cte;
+	unsigned long flags;
+	phys_addr_t db;
+
+	/* Do not cache a directly injected interrupt */
+	if (irq->hw)
+		return;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+	if (unlikely(list_empty(&dist->lpi_translation_cache)))
+		goto out;
+
+	/*
+	 * We could have raced with another CPU caching the same
+	 * translation behind our back, so let's check it is not in
+	 * already
+	 */
+	db = its->vgic_its_base + GITS_TRANSLATER;
+	if (__vgic_its_check_cache(dist, db, devid, eventid))
+		goto out;
+
+	/* Always reuse the last entry (LRU policy) */
+	cte = list_last_entry(&dist->lpi_translation_cache,
+			      typeof(*cte), entry);
+
+	/*
+	 * Caching the translation implies having an extra reference
+	 * to the interrupt, so drop the potential reference on what
+	 * was in the cache, and increment it on the new interrupt.
+	 */
+	if (cte->irq)
+		__vgic_put_lpi_locked(kvm, cte->irq);
+
+	vgic_get_irq_kref(irq);
+
+	cte->db		= db;
+	cte->devid	= devid;
+	cte->eventid	= eventid;
+	cte->irq	= irq;
+
+	/* Move the new translation to the head of the list */
+	list_move(&cte->entry, &dist->lpi_translation_cache);
+
+out:
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_its_invalidate_cache(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_translation_cache_entry *cte;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+		/*
+		 * If we hit a NULL entry, there is nothing after this
+		 * point.
+		 */
+		if (!cte->irq)
+			break;
+
+		__vgic_put_lpi_locked(kvm, cte->irq);
+		cte->irq = NULL;
+	}
+
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+			 u32 devid, u32 eventid, struct vgic_irq **irq)
+{
+	struct kvm_vcpu *vcpu;
+	struct its_ite *ite;
+
+	if (!its->enabled)
+		return -EBUSY;
+
+	ite = find_ite(its, devid, eventid);
+	if (!ite || !its_is_collection_mapped(ite->collection))
+		return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+	if (!vcpu)
+		return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+	if (!vcpu->arch.vgic_cpu.lpis_enabled)
+		return -EBUSY;
+
+	vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
+
+	*irq = ite->irq;
+	return 0;
+}
+
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
+{
+	u64 address;
+	struct kvm_io_device *kvm_io_dev;
+	struct vgic_io_device *iodev;
+
+	if (!vgic_has_its(kvm))
+		return ERR_PTR(-ENODEV);
+
+	if (!(msi->flags & KVM_MSI_VALID_DEVID))
+		return ERR_PTR(-EINVAL);
+
+	address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+	kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+	if (!kvm_io_dev)
+		return ERR_PTR(-EINVAL);
+
+	if (kvm_io_dev->ops != &kvm_io_gic_ops)
+		return ERR_PTR(-EINVAL);
+
+	iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+	if (iodev->iodev_type != IODEV_ITS)
+		return ERR_PTR(-EINVAL);
+
+	return iodev->its;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ * Returns 0 on success, a positive error value for any ITS mapping
+ * related errors and negative error values for generic errors.
+ */
+static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+				u32 devid, u32 eventid)
+{
+	struct vgic_irq *irq = NULL;
+	unsigned long flags;
+	int err;
+
+	err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
+	if (err)
+		return err;
+
+	if (irq->hw)
+		return irq_set_irqchip_state(irq->host_irq,
+					     IRQCHIP_STATE_PENDING, true);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	irq->pending_latch = true;
+	vgic_queue_irq_unlock(kvm, irq, flags);
+
+	return 0;
+}
+
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+	phys_addr_t db;
+
+	db = (u64)msi->address_hi << 32 | msi->address_lo;
+	irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
+
+	if (!irq)
+		return -1;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	irq->pending_latch = true;
+	vgic_queue_irq_unlock(kvm, irq, flags);
+
+	return 0;
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ * According to the KVM_SIGNAL_MSI API description returns 1 on success.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	struct vgic_its *its;
+	int ret;
+
+	if (!vgic_its_inject_cached_translation(kvm, msi))
+		return 1;
+
+	its = vgic_msi_to_its(kvm, msi);
+	if (IS_ERR(its))
+		return PTR_ERR(its);
+
+	mutex_lock(&its->its_lock);
+	ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
+	mutex_unlock(&its->its_lock);
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
+	 * if the guest has blocked the MSI. So we map any LPI mapping
+	 * related error to that.
+	 */
+	if (ret)
+		return 0;
+	else
+		return 1;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
+{
+	list_del(&ite->ite_list);
+
+	/* This put matches the get in vgic_add_lpi. */
+	if (ite->irq) {
+		if (ite->irq->hw)
+			WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+
+		vgic_put_irq(kvm, ite->irq);
+	}
+
+	kfree(ite);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+	return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd)	its_cmd_mask_field(cmd, 0,  0,  8)
+#define its_cmd_get_deviceid(cmd)	its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_size(cmd)		(its_cmd_mask_field(cmd, 1,  0,  5) + 1)
+#define its_cmd_get_id(cmd)		its_cmd_mask_field(cmd, 1,  0, 32)
+#define its_cmd_get_physical_id(cmd)	its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd)	its_cmd_mask_field(cmd, 2,  0, 16)
+#define its_cmd_get_ittaddr(cmd)	(its_cmd_mask_field(cmd, 2,  8, 44) << 8)
+#define its_cmd_get_target_addr(cmd)	its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd)	its_cmd_mask_field(cmd, 2, 63,  1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+				       u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_ite *ite;
+
+	ite = find_ite(its, device_id, event_id);
+	if (ite && its_is_collection_mapped(ite->collection)) {
+		/*
+		 * Though the spec talks about removing the pending state, we
+		 * don't bother here since we clear the ITTE anyway and the
+		 * pending state is a property of the ITTE struct.
+		 */
+		vgic_its_invalidate_cache(kvm);
+
+		its_free_ite(kvm, ite);
+		return 0;
+	}
+
+	return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct kvm_vcpu *vcpu;
+	struct its_ite *ite;
+	struct its_collection *collection;
+
+	ite = find_ite(its, device_id, event_id);
+	if (!ite)
+		return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+	if (!its_is_collection_mapped(ite->collection))
+		return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+	collection = find_collection(its, coll_id);
+	if (!its_is_collection_mapped(collection))
+		return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+	ite->collection = collection;
+	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	vgic_its_invalidate_cache(kvm);
+
+	return update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
+			      gpa_t *eaddr)
+{
+	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+	u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
+	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
+	int esz = GITS_BASER_ENTRY_SIZE(baser);
+	int index, idx;
+	gfn_t gfn;
+	bool ret;
+
+	switch (type) {
+	case GITS_BASER_TYPE_DEVICE:
+		if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
+			return false;
+		break;
+	case GITS_BASER_TYPE_COLLECTION:
+		/* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
+		if (id >= BIT_ULL(16))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	if (!(baser & GITS_BASER_INDIRECT)) {
+		phys_addr_t addr;
+
+		if (id >= (l1_tbl_size / esz))
+			return false;
+
+		addr = base + id * esz;
+		gfn = addr >> PAGE_SHIFT;
+
+		if (eaddr)
+			*eaddr = addr;
+
+		goto out;
+	}
+
+	/* calculate and check the index into the 1st level */
+	index = id / (SZ_64K / esz);
+	if (index >= (l1_tbl_size / sizeof(u64)))
+		return false;
+
+	/* Each 1st level entry is represented by a 64-bit value. */
+	if (kvm_read_guest_lock(its->dev->kvm,
+			   base + index * sizeof(indirect_ptr),
+			   &indirect_ptr, sizeof(indirect_ptr)))
+		return false;
+
+	indirect_ptr = le64_to_cpu(indirect_ptr);
+
+	/* check the valid bit of the first level entry */
+	if (!(indirect_ptr & BIT_ULL(63)))
+		return false;
+
+	/* Mask the guest physical address and calculate the frame number. */
+	indirect_ptr &= GENMASK_ULL(51, 16);
+
+	/* Find the address of the actual entry */
+	index = id % (SZ_64K / esz);
+	indirect_ptr += index * esz;
+	gfn = indirect_ptr >> PAGE_SHIFT;
+
+	if (eaddr)
+		*eaddr = indirect_ptr;
+
+out:
+	idx = srcu_read_lock(&its->dev->kvm->srcu);
+	ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+	srcu_read_unlock(&its->dev->kvm->srcu, idx);
+	return ret;
+}
+
+static int vgic_its_alloc_collection(struct vgic_its *its,
+				     struct its_collection **colp,
+				     u32 coll_id)
+{
+	struct its_collection *collection;
+
+	if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
+		return E_ITS_MAPC_COLLECTION_OOR;
+
+	collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+	if (!collection)
+		return -ENOMEM;
+
+	collection->collection_id = coll_id;
+	collection->target_addr = COLLECTION_NOT_MAPPED;
+
+	list_add_tail(&collection->coll_list, &its->collection_list);
+	*colp = collection;
+
+	return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+	struct its_collection *collection;
+	struct its_device *device;
+	struct its_ite *ite;
+
+	/*
+	 * Clearing the mapping for that collection ID removes the
+	 * entry from the list. If there wasn't any before, we can
+	 * go home early.
+	 */
+	collection = find_collection(its, coll_id);
+	if (!collection)
+		return;
+
+	for_each_lpi_its(device, ite, its)
+		if (ite->collection &&
+		    ite->collection->collection_id == coll_id)
+			ite->collection = NULL;
+
+	list_del(&collection->coll_list);
+	kfree(collection);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
+					  struct its_collection *collection,
+					  u32 event_id)
+{
+	struct its_ite *ite;
+
+	ite = kzalloc(sizeof(*ite), GFP_KERNEL);
+	if (!ite)
+		return ERR_PTR(-ENOMEM);
+
+	ite->event_id	= event_id;
+	ite->collection = collection;
+
+	list_add_tail(&ite->ite_list, &device->itt_head);
+	return ite;
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct its_ite *ite;
+	struct kvm_vcpu *vcpu = NULL;
+	struct its_device *device;
+	struct its_collection *collection, *new_coll = NULL;
+	struct vgic_irq *irq;
+	int lpi_nr;
+
+	device = find_its_device(its, device_id);
+	if (!device)
+		return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+	if (event_id >= BIT_ULL(device->num_eventid_bits))
+		return E_ITS_MAPTI_ID_OOR;
+
+	if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+		lpi_nr = its_cmd_get_physical_id(its_cmd);
+	else
+		lpi_nr = event_id;
+	if (lpi_nr < GIC_LPI_OFFSET ||
+	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+		return E_ITS_MAPTI_PHYSICALID_OOR;
+
+	/* If there is an existing mapping, behavior is UNPREDICTABLE. */
+	if (find_ite(its, device_id, event_id))
+		return 0;
+
+	collection = find_collection(its, coll_id);
+	if (!collection) {
+		int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+		if (ret)
+			return ret;
+		new_coll = collection;
+	}
+
+	ite = vgic_its_alloc_ite(device, collection, event_id);
+	if (IS_ERR(ite)) {
+		if (new_coll)
+			vgic_its_free_collection(its, coll_id);
+		return PTR_ERR(ite);
+	}
+
+	if (its_is_collection_mapped(collection))
+		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
+	if (IS_ERR(irq)) {
+		if (new_coll)
+			vgic_its_free_collection(its, coll_id);
+		its_free_ite(kvm, ite);
+		return PTR_ERR(irq);
+	}
+	ite->irq = irq;
+
+	return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
+{
+	struct its_ite *ite, *temp;
+
+	/*
+	 * The spec says that unmapping a device with still valid
+	 * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+	 * since we cannot leave the memory unreferenced.
+	 */
+	list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
+		its_free_ite(kvm, ite);
+
+	vgic_its_invalidate_cache(kvm);
+
+	list_del(&device->dev_list);
+	kfree(device);
+}
+
+/* its lock must be held */
+static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+{
+	struct its_device *cur, *temp;
+
+	list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+		vgic_its_free_device(kvm, cur);
+}
+
+/* its lock must be held */
+static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+{
+	struct its_collection *cur, *temp;
+
+	list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+		vgic_its_free_collection(its, cur->collection_id);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
+						u32 device_id, gpa_t itt_addr,
+						u8 num_eventid_bits)
+{
+	struct its_device *device;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device)
+		return ERR_PTR(-ENOMEM);
+
+	device->device_id = device_id;
+	device->itt_addr = itt_addr;
+	device->num_eventid_bits = num_eventid_bits;
+	INIT_LIST_HEAD(&device->itt_head);
+
+	list_add_tail(&device->dev_list, &its->device_list);
+	return device;
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	bool valid = its_cmd_get_validbit(its_cmd);
+	u8 num_eventid_bits = its_cmd_get_size(its_cmd);
+	gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
+	struct its_device *device;
+
+	if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
+		return E_ITS_MAPD_DEVICE_OOR;
+
+	if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
+		return E_ITS_MAPD_ITTSIZE_OOR;
+
+	device = find_its_device(its, device_id);
+
+	/*
+	 * The spec says that calling MAPD on an already mapped device
+	 * invalidates all cached data for this device. We implement this
+	 * by removing the mapping and re-establishing it.
+	 */
+	if (device)
+		vgic_its_free_device(kvm, device);
+
+	/*
+	 * The spec does not say whether unmapping a not-mapped device
+	 * is an error, so we are done in any case.
+	 */
+	if (!valid)
+		return 0;
+
+	device = vgic_its_alloc_device(its, device_id, itt_addr,
+				       num_eventid_bits);
+
+	return PTR_ERR_OR_ZERO(device);
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+				    u64 *its_cmd)
+{
+	u16 coll_id;
+	u32 target_addr;
+	struct its_collection *collection;
+	bool valid;
+
+	valid = its_cmd_get_validbit(its_cmd);
+	coll_id = its_cmd_get_collection(its_cmd);
+	target_addr = its_cmd_get_target_addr(its_cmd);
+
+	if (target_addr >= atomic_read(&kvm->online_vcpus))
+		return E_ITS_MAPC_PROCNUM_OOR;
+
+	if (!valid) {
+		vgic_its_free_collection(its, coll_id);
+		vgic_its_invalidate_cache(kvm);
+	} else {
+		collection = find_collection(its, coll_id);
+
+		if (!collection) {
+			int ret;
+
+			ret = vgic_its_alloc_collection(its, &collection,
+							coll_id);
+			if (ret)
+				return ret;
+			collection->target_addr = target_addr;
+		} else {
+			collection->target_addr = target_addr;
+			update_affinity_collection(kvm, its, collection);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+				     u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_ite *ite;
+
+
+	ite = find_ite(its, device_id, event_id);
+	if (!ite)
+		return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+	ite->irq->pending_latch = false;
+
+	if (ite->irq->hw)
+		return irq_set_irqchip_state(ite->irq->host_irq,
+					     IRQCHIP_STATE_PENDING, false);
+
+	return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	u32 device_id = its_cmd_get_deviceid(its_cmd);
+	u32 event_id = its_cmd_get_id(its_cmd);
+	struct its_ite *ite;
+
+
+	ite = find_ite(its, device_id, event_id);
+	if (!ite)
+		return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+	return update_lpi_config(kvm, ite->irq, NULL, true);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+				      u64 *its_cmd)
+{
+	u32 coll_id = its_cmd_get_collection(its_cmd);
+	struct its_collection *collection;
+	struct kvm_vcpu *vcpu;
+	struct vgic_irq *irq;
+	u32 *intids;
+	int irq_count, i;
+
+	collection = find_collection(its, coll_id);
+	if (!its_is_collection_mapped(collection))
+		return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
+	if (irq_count < 0)
+		return irq_count;
+
+	for (i = 0; i < irq_count; i++) {
+		irq = vgic_get_irq(kvm, NULL, intids[i]);
+		if (!irq)
+			continue;
+		update_lpi_config(kvm, irq, vcpu, false);
+		vgic_put_irq(kvm, irq);
+	}
+
+	kfree(intids);
+
+	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
+		its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
+
+	return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+				      u64 *its_cmd)
+{
+	u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+	u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+	struct kvm_vcpu *vcpu1, *vcpu2;
+	struct vgic_irq *irq;
+	u32 *intids;
+	int irq_count, i;
+
+	if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+	    target2_addr >= atomic_read(&kvm->online_vcpus))
+		return E_ITS_MOVALL_PROCNUM_OOR;
+
+	if (target1_addr == target2_addr)
+		return 0;
+
+	vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+	vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+	irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
+	if (irq_count < 0)
+		return irq_count;
+
+	for (i = 0; i < irq_count; i++) {
+		irq = vgic_get_irq(kvm, NULL, intids[i]);
+
+		update_affinity(irq, vcpu2);
+
+		vgic_put_irq(kvm, irq);
+	}
+
+	vgic_its_invalidate_cache(kvm);
+
+	kfree(intids);
+	return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	u32 msi_data = its_cmd_get_id(its_cmd);
+	u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+	return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+				   u64 *its_cmd)
+{
+	int ret = -ENODEV;
+
+	mutex_lock(&its->its_lock);
+	switch (its_cmd_get_command(its_cmd)) {
+	case GITS_CMD_MAPD:
+		ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MAPC:
+		ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MAPI:
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MAPTI:
+		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MOVI:
+		ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_DISCARD:
+		ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_CLEAR:
+		ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_MOVALL:
+		ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_INT:
+		ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_INV:
+		ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_INVALL:
+		ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+		break;
+	case GITS_CMD_SYNC:
+		/* we ignore this command: we are in sync all of the time */
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&its->its_lock);
+
+	return ret;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+				  GITS_BASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+				  GITS_BASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+				  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	/* We support only one (ITS) page size: 64K */
+	reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+	return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+				  GITS_CBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+				  GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+				  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	/* Sanitise the physical address to be 64k aligned. */
+	reg &= ~GENMASK_ULL(15, 12);
+
+	return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+				       gpa_t addr, unsigned int len,
+				       unsigned long val)
+{
+	/* When GITS_CTLR.Enable is 1, this register is RO. */
+	if (its->enabled)
+		return;
+
+	mutex_lock(&its->cmd_lock);
+	its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+	its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+	its->creadr = 0;
+	/*
+	 * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+	 * it to CREADR to make sure we start with an empty command buffer.
+	 */
+	its->cwriter = its->creadr;
+	mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser)	((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE			32
+#define ITS_CMD_OFFSET(reg)		((reg) & GENMASK(19, 5))
+
+/* Must be called with the cmd_lock held. */
+static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
+{
+	gpa_t cbaser;
+	u64 cmd_buf[4];
+
+	/* Commands are only processed when the ITS is enabled. */
+	if (!its->enabled)
+		return;
+
+	cbaser = GITS_CBASER_ADDRESS(its->cbaser);
+
+	while (its->cwriter != its->creadr) {
+		int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
+					      cmd_buf, ITS_CMD_SIZE);
+		/*
+		 * If kvm_read_guest() fails, this could be due to the guest
+		 * programming a bogus value in CBASER or something else going
+		 * wrong from which we cannot easily recover.
+		 * According to section 6.3.2 in the GICv3 spec we can just
+		 * ignore that command then.
+		 */
+		if (!ret)
+			vgic_its_handle_command(kvm, its, cmd_buf);
+
+		its->creadr += ITS_CMD_SIZE;
+		if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+			its->creadr = 0;
+	}
+}
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+					gpa_t addr, unsigned int len,
+					unsigned long val)
+{
+	u64 reg;
+
+	if (!its)
+		return;
+
+	mutex_lock(&its->cmd_lock);
+
+	reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+	reg = ITS_CMD_OFFSET(reg);
+	if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+		mutex_unlock(&its->cmd_lock);
+		return;
+	}
+	its->cwriter = reg;
+
+	vgic_its_process_commands(kvm, its);
+
+	mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+						struct vgic_its *its,
+						gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+					       struct vgic_its *its,
+					       gpa_t addr, unsigned int len)
+{
+	return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
+static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
+					      struct vgic_its *its,
+					      gpa_t addr, unsigned int len,
+					      unsigned long val)
+{
+	u32 cmd_offset;
+	int ret = 0;
+
+	mutex_lock(&its->cmd_lock);
+
+	if (its->enabled) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	cmd_offset = ITS_CMD_OFFSET(val);
+	if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	its->creadr = cmd_offset;
+out:
+	mutex_unlock(&its->cmd_lock);
+	return ret;
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+					      struct vgic_its *its,
+					      gpa_t addr, unsigned int len)
+{
+	u64 reg;
+
+	switch (BASER_INDEX(addr)) {
+	case 0:
+		reg = its->baser_device_table;
+		break;
+	case 1:
+		reg = its->baser_coll_table;
+		break;
+	default:
+		reg = 0;
+		break;
+	}
+
+	return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK	(GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+				      struct vgic_its *its,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 entry_size, table_type;
+	u64 reg, *regptr, clearbits = 0;
+
+	/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+	if (its->enabled)
+		return;
+
+	switch (BASER_INDEX(addr)) {
+	case 0:
+		regptr = &its->baser_device_table;
+		entry_size = abi->dte_esz;
+		table_type = GITS_BASER_TYPE_DEVICE;
+		break;
+	case 1:
+		regptr = &its->baser_coll_table;
+		entry_size = abi->cte_esz;
+		table_type = GITS_BASER_TYPE_COLLECTION;
+		clearbits = GITS_BASER_INDIRECT;
+		break;
+	default:
+		return;
+	}
+
+	reg = update_64bit_reg(*regptr, addr & 7, len, val);
+	reg &= ~GITS_BASER_RO_MASK;
+	reg &= ~clearbits;
+
+	reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+	reg |= table_type << GITS_BASER_TYPE_SHIFT;
+	reg = vgic_sanitise_its_baser(reg);
+
+	*regptr = reg;
+
+	if (!(reg & GITS_BASER_VALID)) {
+		/* Take the its_lock to prevent a race with a save/restore */
+		mutex_lock(&its->its_lock);
+		switch (table_type) {
+		case GITS_BASER_TYPE_DEVICE:
+			vgic_its_free_device_list(kvm, its);
+			break;
+		case GITS_BASER_TYPE_COLLECTION:
+			vgic_its_free_collection_list(kvm, its);
+			break;
+		}
+		mutex_unlock(&its->its_lock);
+	}
+}
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
+					     struct vgic_its *its,
+					     gpa_t addr, unsigned int len)
+{
+	u32 reg = 0;
+
+	mutex_lock(&its->cmd_lock);
+	if (its->creadr == its->cwriter)
+		reg |= GITS_CTLR_QUIESCENT;
+	if (its->enabled)
+		reg |= GITS_CTLR_ENABLE;
+	mutex_unlock(&its->cmd_lock);
+
+	return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	mutex_lock(&its->cmd_lock);
+
+	/*
+	 * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
+	 * device/collection BASER are invalid
+	 */
+	if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
+		(!(its->baser_device_table & GITS_BASER_VALID) ||
+		 !(its->baser_coll_table & GITS_BASER_VALID) ||
+		 !(its->cbaser & GITS_CBASER_VALID)))
+		goto out;
+
+	its->enabled = !!(val & GITS_CTLR_ENABLE);
+	if (!its->enabled)
+		vgic_its_invalidate_cache(kvm);
+
+	/*
+	 * Try to process any pending commands. This function bails out early
+	 * if the ITS is disabled or no commands have been queued.
+	 */
+	vgic_its_process_commands(kvm, its);
+
+out:
+	mutex_unlock(&its->cmd_lock);
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc)		\
+{								\
+	.reg_offset = off,					\
+	.len = length,						\
+	.access_flags = acc,					\
+	.its_read = rd,						\
+	.its_write = wr,					\
+}
+
+#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
+{								\
+	.reg_offset = off,					\
+	.len = length,						\
+	.access_flags = acc,					\
+	.its_read = rd,						\
+	.its_write = wr,					\
+	.uaccess_its_write = uwr,				\
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+			      gpa_t addr, unsigned int len, unsigned long val)
+{
+	/* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+	REGISTER_ITS_DESC(GITS_CTLR,
+		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
+		vgic_mmio_read_its_iidr, its_mmio_write_wi,
+		vgic_mmio_uaccess_write_its_iidr, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_TYPER,
+		vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_CBASER,
+		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_CWRITER,
+		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
+		vgic_mmio_read_its_creadr, its_mmio_write_wi,
+		vgic_mmio_uaccess_write_its_creadr, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_BASER,
+		vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+		vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+		VGIC_ACCESS_32bit),
+};
+
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+		its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
+				   u64 addr)
+{
+	struct vgic_io_device *iodev = &its->iodev;
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	its->vgic_its_base = addr;
+	iodev->regions = its_registers;
+	iodev->nr_regions = ARRAY_SIZE(its_registers);
+	kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+	iodev->base_addr = its->vgic_its_base;
+	iodev->iodev_type = IODEV_ITS;
+	iodev->its = its;
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+				      KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+out:
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
+
+/* Default is 16 cached LPIs per vcpu */
+#define LPI_DEFAULT_PCPU_CACHE_SIZE	16
+
+void vgic_lpi_translation_cache_init(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	unsigned int sz;
+	int i;
+
+	if (!list_empty(&dist->lpi_translation_cache))
+		return;
+
+	sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
+
+	for (i = 0; i < sz; i++) {
+		struct vgic_translation_cache_entry *cte;
+
+		/* An allocation failure is not fatal */
+		cte = kzalloc(sizeof(*cte), GFP_KERNEL);
+		if (WARN_ON(!cte))
+			break;
+
+		INIT_LIST_HEAD(&cte->entry);
+		list_add(&cte->entry, &dist->lpi_translation_cache);
+	}
+}
+
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_translation_cache_entry *cte, *tmp;
+
+	vgic_its_invalidate_cache(kvm);
+
+	list_for_each_entry_safe(cte, tmp,
+				 &dist->lpi_translation_cache, entry) {
+		list_del(&cte->entry);
+		kfree(cte);
+	}
+}
+
+#define INITIAL_BASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)		| \
+	 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)		| \
+	 GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)		| \
+	 GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)		| \
+	 GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)	| \
+	 GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+	struct vgic_its *its;
+
+	if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+		return -ENODEV;
+
+	its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+	if (!its)
+		return -ENOMEM;
+
+	if (vgic_initialized(dev->kvm)) {
+		int ret = vgic_v4_init(dev->kvm);
+		if (ret < 0) {
+			kfree(its);
+			return ret;
+		}
+
+		vgic_lpi_translation_cache_init(dev->kvm);
+	}
+
+	mutex_init(&its->its_lock);
+	mutex_init(&its->cmd_lock);
+
+	its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+	INIT_LIST_HEAD(&its->device_list);
+	INIT_LIST_HEAD(&its->collection_list);
+
+	dev->kvm->arch.vgic.msis_require_devid = true;
+	dev->kvm->arch.vgic.has_its = true;
+	its->enabled = false;
+	its->dev = dev;
+
+	its->baser_device_table = INITIAL_BASER_VALUE			|
+		((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+	its->baser_coll_table = INITIAL_BASER_VALUE |
+		((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+	dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+	dev->private = its;
+
+	return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+	struct kvm *kvm = kvm_dev->kvm;
+	struct vgic_its *its = kvm_dev->private;
+
+	mutex_lock(&its->its_lock);
+
+	vgic_its_free_device_list(kvm, its);
+	vgic_its_free_collection_list(kvm, its);
+
+	mutex_unlock(&its->its_lock);
+	kfree(its);
+	kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */
+}
+
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+				  struct kvm_device_attr *attr)
+{
+	const struct vgic_register_region *region;
+	gpa_t offset = attr->attr;
+	int align;
+
+	align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
+
+	if (offset & align)
+		return -EINVAL;
+
+	region = vgic_find_mmio_region(its_registers,
+				       ARRAY_SIZE(its_registers),
+				       offset);
+	if (!region)
+		return -ENXIO;
+
+	return 0;
+}
+
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+				     struct kvm_device_attr *attr,
+				     u64 *reg, bool is_write)
+{
+	const struct vgic_register_region *region;
+	struct vgic_its *its;
+	gpa_t addr, offset;
+	unsigned int len;
+	int align, ret = 0;
+
+	its = dev->private;
+	offset = attr->attr;
+
+	/*
+	 * Although the spec supports upper/lower 32-bit accesses to
+	 * 64-bit ITS registers, the userspace ABI requires 64-bit
+	 * accesses to all 64-bit wide registers. We therefore only
+	 * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
+	 * registers
+	 */
+	if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
+		align = 0x3;
+	else
+		align = 0x7;
+
+	if (offset & align)
+		return -EINVAL;
+
+	mutex_lock(&dev->kvm->lock);
+
+	if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	region = vgic_find_mmio_region(its_registers,
+				       ARRAY_SIZE(its_registers),
+				       offset);
+	if (!region) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	if (!lock_all_vcpus(dev->kvm)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	addr = its->vgic_its_base + offset;
+
+	len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
+
+	if (is_write) {
+		if (region->uaccess_its_write)
+			ret = region->uaccess_its_write(dev->kvm, its, addr,
+							len, *reg);
+		else
+			region->its_write(dev->kvm, its, addr, len, *reg);
+	} else {
+		*reg = region->its_read(dev->kvm, its, addr, len);
+	}
+	unlock_all_vcpus(dev->kvm);
+out:
+	mutex_unlock(&dev->kvm->lock);
+	return ret;
+}
+
+static u32 compute_next_devid_offset(struct list_head *h,
+				     struct its_device *dev)
+{
+	struct its_device *next;
+	u32 next_offset;
+
+	if (list_is_last(&dev->dev_list, h))
+		return 0;
+	next = list_next_entry(dev, dev_list);
+	next_offset = next->device_id - dev->device_id;
+
+	return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
+}
+
+static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
+{
+	struct its_ite *next;
+	u32 next_offset;
+
+	if (list_is_last(&ite->ite_list, h))
+		return 0;
+	next = list_next_entry(ite, ite_list);
+	next_offset = next->event_id - ite->event_id;
+
+	return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
+}
+
+/**
+ * entry_fn_t - Callback called on a table entry restore path
+ * @its: its handle
+ * @id: id of the entry
+ * @entry: pointer to the entry
+ * @opaque: pointer to an opaque data
+ *
+ * Return: < 0 on error, 0 if last element was identified, id offset to next
+ * element otherwise
+ */
+typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
+			  void *opaque);
+
+/**
+ * scan_its_table - Scan a contiguous table in guest RAM and applies a function
+ * to each entry
+ *
+ * @its: its handle
+ * @base: base gpa of the table
+ * @size: size of the table in bytes
+ * @esz: entry size in bytes
+ * @start_id: the ID of the first entry in the table
+ * (non zero for 2d level tables)
+ * @fn: function to apply on each entry
+ *
+ * Return: < 0 on error, 0 if last element was identified, 1 otherwise
+ * (the last element may not be found on second level tables)
+ */
+static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
+			  int start_id, entry_fn_t fn, void *opaque)
+{
+	struct kvm *kvm = its->dev->kvm;
+	unsigned long len = size;
+	int id = start_id;
+	gpa_t gpa = base;
+	char entry[ESZ_MAX];
+	int ret;
+
+	memset(entry, 0, esz);
+
+	while (len > 0) {
+		int next_offset;
+		size_t byte_offset;
+
+		ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
+		if (ret)
+			return ret;
+
+		next_offset = fn(its, id, entry, opaque);
+		if (next_offset <= 0)
+			return next_offset;
+
+		byte_offset = next_offset * esz;
+		id += next_offset;
+		gpa += byte_offset;
+		len -= byte_offset;
+	}
+	return 1;
+}
+
+/**
+ * vgic_its_save_ite - Save an interrupt translation entry at @gpa
+ */
+static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
+			      struct its_ite *ite, gpa_t gpa, int ite_esz)
+{
+	struct kvm *kvm = its->dev->kvm;
+	u32 next_offset;
+	u64 val;
+
+	next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
+	val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
+	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
+		ite->collection->collection_id;
+	val = cpu_to_le64(val);
+	return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
+}
+
+/**
+ * vgic_its_restore_ite - restore an interrupt translation entry
+ * @event_id: id used for indexing
+ * @ptr: pointer to the ITE entry
+ * @opaque: pointer to the its_device
+ */
+static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
+				void *ptr, void *opaque)
+{
+	struct its_device *dev = (struct its_device *)opaque;
+	struct its_collection *collection;
+	struct kvm *kvm = its->dev->kvm;
+	struct kvm_vcpu *vcpu = NULL;
+	u64 val;
+	u64 *p = (u64 *)ptr;
+	struct vgic_irq *irq;
+	u32 coll_id, lpi_id;
+	struct its_ite *ite;
+	u32 offset;
+
+	val = *p;
+
+	val = le64_to_cpu(val);
+
+	coll_id = val & KVM_ITS_ITE_ICID_MASK;
+	lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
+
+	if (!lpi_id)
+		return 1; /* invalid entry, no choice but to scan next entry */
+
+	if (lpi_id < VGIC_MIN_LPI)
+		return -EINVAL;
+
+	offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
+	if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
+		return -EINVAL;
+
+	collection = find_collection(its, coll_id);
+	if (!collection)
+		return -EINVAL;
+
+	ite = vgic_its_alloc_ite(dev, collection, event_id);
+	if (IS_ERR(ite))
+		return PTR_ERR(ite);
+
+	if (its_is_collection_mapped(collection))
+		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+	irq = vgic_add_lpi(kvm, lpi_id, vcpu);
+	if (IS_ERR(irq))
+		return PTR_ERR(irq);
+	ite->irq = irq;
+
+	return offset;
+}
+
+static int vgic_its_ite_cmp(void *priv, struct list_head *a,
+			    struct list_head *b)
+{
+	struct its_ite *itea = container_of(a, struct its_ite, ite_list);
+	struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
+
+	if (itea->event_id < iteb->event_id)
+		return -1;
+	else
+		return 1;
+}
+
+static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	gpa_t base = device->itt_addr;
+	struct its_ite *ite;
+	int ret;
+	int ite_esz = abi->ite_esz;
+
+	list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
+
+	list_for_each_entry(ite, &device->itt_head, ite_list) {
+		gpa_t gpa = base + ite->event_id * ite_esz;
+
+		/*
+		 * If an LPI carries the HW bit, this means that this
+		 * interrupt is controlled by GICv4, and we do not
+		 * have direct access to that state. Let's simply fail
+		 * the save operation...
+		 */
+		if (ite->irq->hw)
+			return -EACCES;
+
+		ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ * vgic_its_restore_itt - restore the ITT of a device
+ *
+ * @its: its handle
+ * @dev: device handle
+ *
+ * Return 0 on success, < 0 on error
+ */
+static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	gpa_t base = dev->itt_addr;
+	int ret;
+	int ite_esz = abi->ite_esz;
+	size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
+
+	ret = scan_its_table(its, base, max_size, ite_esz, 0,
+			     vgic_its_restore_ite, dev);
+
+	/* scan_its_table returns +1 if all ITEs are invalid */
+	if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+/**
+ * vgic_its_save_dte - Save a device table entry at a given GPA
+ *
+ * @its: ITS handle
+ * @dev: ITS device
+ * @ptr: GPA
+ */
+static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
+			     gpa_t ptr, int dte_esz)
+{
+	struct kvm *kvm = its->dev->kvm;
+	u64 val, itt_addr_field;
+	u32 next_offset;
+
+	itt_addr_field = dev->itt_addr >> 8;
+	next_offset = compute_next_devid_offset(&its->device_list, dev);
+	val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
+	       ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
+	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
+		(dev->num_eventid_bits - 1));
+	val = cpu_to_le64(val);
+	return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
+}
+
+/**
+ * vgic_its_restore_dte - restore a device table entry
+ *
+ * @its: its handle
+ * @id: device id the DTE corresponds to
+ * @ptr: kernel VA where the 8 byte DTE is located
+ * @opaque: unused
+ *
+ * Return: < 0 on error, 0 if the dte is the last one, id offset to the
+ * next dte otherwise
+ */
+static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
+				void *ptr, void *opaque)
+{
+	struct its_device *dev;
+	gpa_t itt_addr;
+	u8 num_eventid_bits;
+	u64 entry = *(u64 *)ptr;
+	bool valid;
+	u32 offset;
+	int ret;
+
+	entry = le64_to_cpu(entry);
+
+	valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
+	num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
+	itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
+			>> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
+
+	if (!valid)
+		return 1;
+
+	/* dte entry is valid */
+	offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
+
+	dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	ret = vgic_its_restore_itt(its, dev);
+	if (ret) {
+		vgic_its_free_device(its->dev->kvm, dev);
+		return ret;
+	}
+
+	return offset;
+}
+
+static int vgic_its_device_cmp(void *priv, struct list_head *a,
+			       struct list_head *b)
+{
+	struct its_device *deva = container_of(a, struct its_device, dev_list);
+	struct its_device *devb = container_of(b, struct its_device, dev_list);
+
+	if (deva->device_id < devb->device_id)
+		return -1;
+	else
+		return 1;
+}
+
+/**
+ * vgic_its_save_device_tables - Save the device table and all ITT
+ * into guest RAM
+ *
+ * L1/L2 handling is hidden by vgic_its_check_id() helper which directly
+ * returns the GPA of the device entry
+ */
+static int vgic_its_save_device_tables(struct vgic_its *its)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 baser = its->baser_device_table;
+	struct its_device *dev;
+	int dte_esz = abi->dte_esz;
+
+	if (!(baser & GITS_BASER_VALID))
+		return 0;
+
+	list_sort(NULL, &its->device_list, vgic_its_device_cmp);
+
+	list_for_each_entry(dev, &its->device_list, dev_list) {
+		int ret;
+		gpa_t eaddr;
+
+		if (!vgic_its_check_id(its, baser,
+				       dev->device_id, &eaddr))
+			return -EINVAL;
+
+		ret = vgic_its_save_itt(its, dev);
+		if (ret)
+			return ret;
+
+		ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ * handle_l1_dte - callback used for L1 device table entries (2 stage case)
+ *
+ * @its: its handle
+ * @id: index of the entry in the L1 table
+ * @addr: kernel VA
+ * @opaque: unused
+ *
+ * L1 table entries are scanned by steps of 1 entry
+ * Return < 0 if error, 0 if last dte was found when scanning the L2
+ * table, +1 otherwise (meaning next L1 entry must be scanned)
+ */
+static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
+			 void *opaque)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	int l2_start_id = id * (SZ_64K / abi->dte_esz);
+	u64 entry = *(u64 *)addr;
+	int dte_esz = abi->dte_esz;
+	gpa_t gpa;
+	int ret;
+
+	entry = le64_to_cpu(entry);
+
+	if (!(entry & KVM_ITS_L1E_VALID_MASK))
+		return 1;
+
+	gpa = entry & KVM_ITS_L1E_ADDR_MASK;
+
+	ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
+			     l2_start_id, vgic_its_restore_dte, NULL);
+
+	return ret;
+}
+
+/**
+ * vgic_its_restore_device_tables - Restore the device table and all ITT
+ * from guest RAM to internal data structs
+ */
+static int vgic_its_restore_device_tables(struct vgic_its *its)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 baser = its->baser_device_table;
+	int l1_esz, ret;
+	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+	gpa_t l1_gpa;
+
+	if (!(baser & GITS_BASER_VALID))
+		return 0;
+
+	l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+	if (baser & GITS_BASER_INDIRECT) {
+		l1_esz = GITS_LVL1_ENTRY_SIZE;
+		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+				     handle_l1_dte, NULL);
+	} else {
+		l1_esz = abi->dte_esz;
+		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+				     vgic_its_restore_dte, NULL);
+	}
+
+	/* scan_its_table returns +1 if all entries are invalid */
+	if (ret > 0)
+		ret = 0;
+
+	return ret;
+}
+
+static int vgic_its_save_cte(struct vgic_its *its,
+			     struct its_collection *collection,
+			     gpa_t gpa, int esz)
+{
+	u64 val;
+
+	val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
+	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
+	       collection->collection_id);
+	val = cpu_to_le64(val);
+	return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
+}
+
+static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
+{
+	struct its_collection *collection;
+	struct kvm *kvm = its->dev->kvm;
+	u32 target_addr, coll_id;
+	u64 val;
+	int ret;
+
+	BUG_ON(esz > sizeof(val));
+	ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
+	if (ret)
+		return ret;
+	val = le64_to_cpu(val);
+	if (!(val & KVM_ITS_CTE_VALID_MASK))
+		return 0;
+
+	target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
+	coll_id = val & KVM_ITS_CTE_ICID_MASK;
+
+	if (target_addr != COLLECTION_NOT_MAPPED &&
+	    target_addr >= atomic_read(&kvm->online_vcpus))
+		return -EINVAL;
+
+	collection = find_collection(its, coll_id);
+	if (collection)
+		return -EEXIST;
+	ret = vgic_its_alloc_collection(its, &collection, coll_id);
+	if (ret)
+		return ret;
+	collection->target_addr = target_addr;
+	return 1;
+}
+
+/**
+ * vgic_its_save_collection_table - Save the collection table into
+ * guest RAM
+ */
+static int vgic_its_save_collection_table(struct vgic_its *its)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 baser = its->baser_coll_table;
+	gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
+	struct its_collection *collection;
+	u64 val;
+	size_t max_size, filled = 0;
+	int ret, cte_esz = abi->cte_esz;
+
+	if (!(baser & GITS_BASER_VALID))
+		return 0;
+
+	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+	list_for_each_entry(collection, &its->collection_list, coll_list) {
+		ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
+		if (ret)
+			return ret;
+		gpa += cte_esz;
+		filled += cte_esz;
+	}
+
+	if (filled == max_size)
+		return 0;
+
+	/*
+	 * table is not fully filled, add a last dummy element
+	 * with valid bit unset
+	 */
+	val = 0;
+	BUG_ON(cte_esz > sizeof(val));
+	ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
+	return ret;
+}
+
+/**
+ * vgic_its_restore_collection_table - reads the collection table
+ * in guest memory and restores the ITS internal state. Requires the
+ * BASER registers to be restored before.
+ */
+static int vgic_its_restore_collection_table(struct vgic_its *its)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	u64 baser = its->baser_coll_table;
+	int cte_esz = abi->cte_esz;
+	size_t max_size, read = 0;
+	gpa_t gpa;
+	int ret;
+
+	if (!(baser & GITS_BASER_VALID))
+		return 0;
+
+	gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+	while (read < max_size) {
+		ret = vgic_its_restore_cte(its, gpa, cte_esz);
+		if (ret <= 0)
+			break;
+		gpa += cte_esz;
+		read += cte_esz;
+	}
+
+	if (ret > 0)
+		return 0;
+
+	return ret;
+}
+
+/**
+ * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM
+ * according to v0 ABI
+ */
+static int vgic_its_save_tables_v0(struct vgic_its *its)
+{
+	int ret;
+
+	ret = vgic_its_save_device_tables(its);
+	if (ret)
+		return ret;
+
+	return vgic_its_save_collection_table(its);
+}
+
+/**
+ * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
+ * to internal data structs according to V0 ABI
+ *
+ */
+static int vgic_its_restore_tables_v0(struct vgic_its *its)
+{
+	int ret;
+
+	ret = vgic_its_restore_collection_table(its);
+	if (ret)
+		return ret;
+
+	return vgic_its_restore_device_tables(its);
+}
+
+static int vgic_its_commit_v0(struct vgic_its *its)
+{
+	const struct vgic_its_abi *abi;
+
+	abi = vgic_its_get_abi(its);
+	its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+	its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+
+	its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
+					<< GITS_BASER_ENTRY_SIZE_SHIFT);
+
+	its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
+					<< GITS_BASER_ENTRY_SIZE_SHIFT);
+	return 0;
+}
+
+static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+{
+	/* We need to keep the ABI specific field values */
+	its->baser_coll_table &= ~GITS_BASER_VALID;
+	its->baser_device_table &= ~GITS_BASER_VALID;
+	its->cbaser = 0;
+	its->creadr = 0;
+	its->cwriter = 0;
+	its->enabled = 0;
+	vgic_its_free_device_list(kvm, its);
+	vgic_its_free_collection_list(kvm, its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_ITS_ADDR_TYPE:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		case KVM_DEV_ARM_ITS_CTRL_RESET:
+			return 0;
+		case KVM_DEV_ARM_ITS_SAVE_TABLES:
+			return 0;
+		case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
+		return vgic_its_has_attr_regs(dev, attr);
+	}
+	return -ENXIO;
+}
+
+static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+{
+	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+	int ret = 0;
+
+	if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+		return 0;
+
+	mutex_lock(&kvm->lock);
+	mutex_lock(&its->its_lock);
+
+	if (!lock_all_vcpus(kvm)) {
+		mutex_unlock(&its->its_lock);
+		mutex_unlock(&kvm->lock);
+		return -EBUSY;
+	}
+
+	switch (attr) {
+	case KVM_DEV_ARM_ITS_CTRL_RESET:
+		vgic_its_reset(kvm, its);
+		break;
+	case KVM_DEV_ARM_ITS_SAVE_TABLES:
+		ret = abi->save_tables(its);
+		break;
+	case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+		ret = abi->restore_tables(its);
+		break;
+	}
+
+	unlock_all_vcpus(kvm);
+	mutex_unlock(&its->its_lock);
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	struct vgic_its *its = dev->private;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		unsigned long type = (unsigned long)attr->attr;
+		u64 addr;
+
+		if (type != KVM_VGIC_ITS_ADDR_TYPE)
+			return -ENODEV;
+
+		if (copy_from_user(&addr, uaddr, sizeof(addr)))
+			return -EFAULT;
+
+		ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+					addr, SZ_64K);
+		if (ret)
+			return ret;
+
+		return vgic_register_its_iodev(dev->kvm, its, addr);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		return vgic_its_ctrl(dev->kvm, its, attr->attr);
+	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 reg;
+
+		if (get_user(reg, uaddr))
+			return -EFAULT;
+
+		return vgic_its_attr_regs_access(dev, attr, &reg, true);
+	}
+	}
+	return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+			     struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		struct vgic_its *its = dev->private;
+		u64 addr = its->vgic_its_base;
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		unsigned long type = (unsigned long)attr->attr;
+
+		if (type != KVM_VGIC_ITS_ADDR_TYPE)
+			return -ENODEV;
+
+		if (copy_to_user(uaddr, &addr, sizeof(addr)))
+			return -EFAULT;
+		break;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 reg;
+		int ret;
+
+		ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
+		if (ret)
+			return ret;
+		return put_user(reg, uaddr);
+	}
+	default:
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+	.name = "kvm-arm-vgic-its",
+	.create = vgic_its_create,
+	.destroy = vgic_its_destroy,
+	.set_attr = vgic_its_set_attr,
+	.get_attr = vgic_its_get_attr,
+	.has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+	return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+				       KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
new file mode 100644
index 000000000000..44419679f91a
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC: KVM DEVICE API
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_mmu.h>
+#include <asm/cputype.h>
+#include "vgic.h"
+
+/* common helpers */
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+		      phys_addr_t addr, phys_addr_t alignment)
+{
+	if (addr & ~kvm_phys_mask(kvm))
+		return -E2BIG;
+
+	if (!IS_ALIGNED(addr, alignment))
+		return -EINVAL;
+
+	if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
+		return -EEXIST;
+
+	return 0;
+}
+
+static int vgic_check_type(struct kvm *kvm, int type_needed)
+{
+	if (kvm->arch.vgic.vgic_model != type_needed)
+		return -ENODEV;
+	else
+		return 0;
+}
+
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm:   pointer to the vm struct
+ * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
+ * @addr:  pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ *          address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space.  These addresses are properties
+ * of the emulated core/SoC and therefore user space initially knows this
+ * information.
+ * Check them for sanity (alignment, double assignment). We can't check for
+ * overlapping regions in case of a virtual GICv3 here, since we don't know
+ * the number of VCPUs yet, so we defer this check to map_resources().
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+{
+	int r = 0;
+	struct vgic_dist *vgic = &kvm->arch.vgic;
+	phys_addr_t *addr_ptr, alignment;
+	u64 undef_value = VGIC_ADDR_UNDEF;
+
+	mutex_lock(&kvm->lock);
+	switch (type) {
+	case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		addr_ptr = &vgic->vgic_dist_base;
+		alignment = SZ_4K;
+		break;
+	case KVM_VGIC_V2_ADDR_TYPE_CPU:
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		addr_ptr = &vgic->vgic_cpu_base;
+		alignment = SZ_4K;
+		break;
+	case KVM_VGIC_V3_ADDR_TYPE_DIST:
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+		addr_ptr = &vgic->vgic_dist_base;
+		alignment = SZ_64K;
+		break;
+	case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
+		struct vgic_redist_region *rdreg;
+
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+		if (r)
+			break;
+		if (write) {
+			r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
+			goto out;
+		}
+		rdreg = list_first_entry(&vgic->rd_regions,
+					 struct vgic_redist_region, list);
+		if (!rdreg)
+			addr_ptr = &undef_value;
+		else
+			addr_ptr = &rdreg->base;
+		break;
+	}
+	case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+	{
+		struct vgic_redist_region *rdreg;
+		u8 index;
+
+		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+		if (r)
+			break;
+
+		index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
+
+		if (write) {
+			gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
+			u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
+					>> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+			u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
+					>> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
+
+			if (!count || flags)
+				r = -EINVAL;
+			else
+				r = vgic_v3_set_redist_base(kvm, index,
+							    base, count);
+			goto out;
+		}
+
+		rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+		if (!rdreg) {
+			r = -ENOENT;
+			goto out;
+		}
+
+		*addr = index;
+		*addr |= rdreg->base;
+		*addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+		goto out;
+	}
+	default:
+		r = -ENODEV;
+	}
+
+	if (r)
+		goto out;
+
+	if (write) {
+		r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
+		if (!r)
+			*addr_ptr = *addr;
+	} else {
+		*addr = *addr_ptr;
+	}
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int vgic_set_common_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	int r;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 addr;
+		unsigned long type = (unsigned long)attr->attr;
+
+		if (copy_from_user(&addr, uaddr, sizeof(addr)))
+			return -EFAULT;
+
+		r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+		return (r == -ENODEV) ? -ENXIO : r;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 val;
+		int ret = 0;
+
+		if (get_user(val, uaddr))
+			return -EFAULT;
+
+		/*
+		 * We require:
+		 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+		 * - at most 1024 interrupts
+		 * - a multiple of 32 interrupts
+		 */
+		if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+		    val > VGIC_MAX_RESERVED ||
+		    (val & 31))
+			return -EINVAL;
+
+		mutex_lock(&dev->kvm->lock);
+
+		if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+			ret = -EBUSY;
+		else
+			dev->kvm->arch.vgic.nr_spis =
+				val - VGIC_NR_PRIVATE_IRQS;
+
+		mutex_unlock(&dev->kvm->lock);
+
+		return ret;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			mutex_lock(&dev->kvm->lock);
+			r = vgic_init(dev->kvm);
+			mutex_unlock(&dev->kvm->lock);
+			return r;
+		}
+		break;
+	}
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_get_common_attr(struct kvm_device *dev,
+				struct kvm_device_attr *attr)
+{
+	int r = -ENXIO;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 addr;
+		unsigned long type = (unsigned long)attr->attr;
+
+		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+		if (r)
+			return (r == -ENODEV) ? -ENXIO : r;
+
+		if (copy_to_user(uaddr, &addr, sizeof(addr)))
+			return -EFAULT;
+		break;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+		r = put_user(dev->kvm->arch.vgic.nr_spis +
+			     VGIC_NR_PRIVATE_IRQS, uaddr);
+		break;
+	}
+	}
+
+	return r;
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+	return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+	kfree(dev);
+}
+
+int kvm_register_vgic_device(unsigned long type)
+{
+	int ret = -ENODEV;
+
+	switch (type) {
+	case KVM_DEV_TYPE_ARM_VGIC_V2:
+		ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+					      KVM_DEV_TYPE_ARM_VGIC_V2);
+		break;
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
+		ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+					      KVM_DEV_TYPE_ARM_VGIC_V3);
+
+		if (ret)
+			break;
+		ret = kvm_vgic_register_its_device();
+		break;
+	}
+
+	return ret;
+}
+
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr)
+{
+	int cpuid;
+
+	cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+		 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+
+	if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+		return -EINVAL;
+
+	reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+	return 0;
+}
+
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+	struct kvm_vcpu *tmp_vcpu;
+
+	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+		tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+		mutex_unlock(&tmp_vcpu->mutex);
+	}
+}
+
+void unlock_all_vcpus(struct kvm *kvm)
+{
+	unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+bool lock_all_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *tmp_vcpu;
+	int c;
+
+	/*
+	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
+	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
+	 * that no other VCPUs are run and fiddle with the vgic state while we
+	 * access it.
+	 */
+	kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+		if (!mutex_trylock(&tmp_vcpu->mutex)) {
+			unlock_vcpus(kvm, c - 1);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      address the value is read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v2_attr_regs_access(struct kvm_device *dev,
+				    struct kvm_device_attr *attr,
+				    u32 *reg, bool is_write)
+{
+	struct vgic_reg_attr reg_attr;
+	gpa_t addr;
+	struct kvm_vcpu *vcpu;
+	int ret;
+
+	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+	if (ret)
+		return ret;
+
+	vcpu = reg_attr.vcpu;
+	addr = reg_attr.addr;
+
+	mutex_lock(&dev->kvm->lock);
+
+	ret = vgic_init(dev->kvm);
+	if (ret)
+		goto out;
+
+	if (!lock_all_vcpus(dev->kvm)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+		ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	unlock_all_vcpus(dev->kvm);
+out:
+	mutex_unlock(&dev->kvm->lock);
+	return ret;
+}
+
+static int vgic_v2_set_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_set_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 reg;
+
+		if (get_user(reg, uaddr))
+			return -EFAULT;
+
+		return vgic_v2_attr_regs_access(dev, attr, &reg, true);
+	}
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v2_get_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_get_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 reg = 0;
+
+		ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
+		if (ret)
+			return ret;
+		return put_user(reg, uaddr);
+	}
+	}
+
+	return -ENXIO;
+}
+
+static int vgic_v2_has_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_V2_ADDR_TYPE_DIST:
+		case KVM_VGIC_V2_ADDR_TYPE_CPU:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		return vgic_v2_has_attr_regs(dev, attr);
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return 0;
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		}
+	}
+	return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+	.name = "kvm-arm-vgic-v2",
+	.create = vgic_create,
+	.destroy = vgic_destroy,
+	.set_attr = vgic_v2_set_attr,
+	.get_attr = vgic_v2_get_attr,
+	.has_attr = vgic_v2_has_attr,
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr)
+{
+	unsigned long vgic_mpidr, mpidr_reg;
+
+	/*
+	 * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
+	 * attr might not hold MPIDR. Hence assume vcpu0.
+	 */
+	if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
+		vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
+			      KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
+
+		mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
+		reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
+	} else {
+		reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
+	}
+
+	if (!reg_attr->vcpu)
+		return -EINVAL;
+
+	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+	return 0;
+}
+
+/*
+ * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
+ *
+ * @dev:      kvm device handle
+ * @attr:     kvm device attribute
+ * @reg:      address the value is read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v3_attr_regs_access(struct kvm_device *dev,
+				    struct kvm_device_attr *attr,
+				    u64 *reg, bool is_write)
+{
+	struct vgic_reg_attr reg_attr;
+	gpa_t addr;
+	struct kvm_vcpu *vcpu;
+	int ret;
+	u32 tmp32;
+
+	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+	if (ret)
+		return ret;
+
+	vcpu = reg_attr.vcpu;
+	addr = reg_attr.addr;
+
+	mutex_lock(&dev->kvm->lock);
+
+	if (unlikely(!vgic_initialized(dev->kvm))) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (!lock_all_vcpus(dev->kvm)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+		if (is_write)
+			tmp32 = *reg;
+
+		ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
+		if (!is_write)
+			*reg = tmp32;
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+		if (is_write)
+			tmp32 = *reg;
+
+		ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
+		if (!is_write)
+			*reg = tmp32;
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+		u64 regid;
+
+		regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+		ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
+						  regid, reg);
+		break;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+		unsigned int info, intid;
+
+		info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+			KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
+		if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
+			intid = attr->attr &
+				KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
+			ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
+							      intid, reg);
+		} else {
+			ret = -EINVAL;
+		}
+		break;
+	}
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	unlock_all_vcpus(dev->kvm);
+out:
+	mutex_unlock(&dev->kvm->lock);
+	return ret;
+}
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_set_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u32 tmp32;
+		u64 reg;
+
+		if (get_user(tmp32, uaddr))
+			return -EFAULT;
+
+		reg = tmp32;
+		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 reg;
+
+		if (get_user(reg, uaddr))
+			return -EFAULT;
+
+		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u64 reg;
+		u32 tmp32;
+
+		if (get_user(tmp32, uaddr))
+			return -EFAULT;
+
+		reg = tmp32;
+		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+		int ret;
+
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+			mutex_lock(&dev->kvm->lock);
+
+			if (!lock_all_vcpus(dev->kvm)) {
+				mutex_unlock(&dev->kvm->lock);
+				return -EBUSY;
+			}
+			ret = vgic_v3_save_pending_tables(dev->kvm);
+			unlock_all_vcpus(dev->kvm);
+			mutex_unlock(&dev->kvm->lock);
+			return ret;
+		}
+		break;
+	}
+	}
+	return -ENXIO;
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	int ret;
+
+	ret = vgic_get_common_attr(dev, attr);
+	if (ret != -ENXIO)
+		return ret;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u64 reg;
+		u32 tmp32;
+
+		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+		if (ret)
+			return ret;
+		tmp32 = reg;
+		return put_user(tmp32, uaddr);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+		u64 reg;
+
+		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+		if (ret)
+			return ret;
+		return put_user(reg, uaddr);
+	}
+	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+		u64 reg;
+		u32 tmp32;
+
+		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+		if (ret)
+			return ret;
+		tmp32 = reg;
+		return put_user(tmp32, uaddr);
+	}
+	}
+	return -ENXIO;
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+			    struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_ADDR:
+		switch (attr->attr) {
+		case KVM_VGIC_V3_ADDR_TYPE_DIST:
+		case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+		case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+			return 0;
+		}
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+		return vgic_v3_has_attr_regs(dev, attr);
+	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+		return 0;
+	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+		if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+		      KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
+		      VGIC_LEVEL_INFO_LINE_LEVEL)
+			return 0;
+		break;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CTRL:
+		switch (attr->attr) {
+		case KVM_DEV_ARM_VGIC_CTRL_INIT:
+			return 0;
+		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+			return 0;
+		}
+	}
+	return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+	.name = "kvm-arm-vgic-v3",
+	.create = vgic_create,
+	.destroy = vgic_destroy,
+	.set_attr = vgic_v3_set_attr,
+	.get_attr = vgic_v3_get_attr,
+	.has_attr = vgic_v3_has_attr,
+};
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
new file mode 100644
index 000000000000..a016f07adc28
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv2 MMIO handling functions
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/nospec.h>
+
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * The Revision field in the IIDR have the following meanings:
+ *
+ * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ * 	       their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+	u32 value;
+
+	switch (addr & 0x0c) {
+	case GIC_DIST_CTRL:
+		value = vgic->enabled ? GICD_ENABLE : 0;
+		break;
+	case GIC_DIST_CTR:
+		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+		value = (value >> 5) - 1;
+		value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
+		break;
+	case GIC_DIST_IIDR:
+		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+		break;
+	default:
+		return 0;
+	}
+
+	return value;
+}
+
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	bool was_enabled = dist->enabled;
+
+	switch (addr & 0x0c) {
+	case GIC_DIST_CTRL:
+		dist->enabled = val & GICD_ENABLE;
+		if (!was_enabled && dist->enabled)
+			vgic_kick_vcpus(vcpu->kvm);
+		break;
+	case GIC_DIST_CTR:
+	case GIC_DIST_IIDR:
+		/* Nothing to do */
+		return;
+	}
+}
+
+static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len,
+					   unsigned long val)
+{
+	switch (addr & 0x0c) {
+	case GIC_DIST_IIDR:
+		if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
+			return -EINVAL;
+
+		/*
+		 * If we observe a write to GICD_IIDR we know that userspace
+		 * has been updated and has had a chance to cope with older
+		 * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
+		 * interrupts as group 1, and therefore we now allow groups to
+		 * be user writable.  Doing this by default would break
+		 * migration from old kernels to new kernels with legacy
+		 * userspace.
+		 */
+		vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
+		return 0;
+	}
+
+	vgic_mmio_write_v2_misc(vcpu, addr, len, val);
+	return 0;
+}
+
+static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len,
+					    unsigned long val)
+{
+	if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
+		vgic_mmio_write_group(vcpu, addr, len, val);
+
+	return 0;
+}
+
+static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
+				 gpa_t addr, unsigned int len,
+				 unsigned long val)
+{
+	int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
+	int intid = val & 0xf;
+	int targets = (val >> 16) & 0xff;
+	int mode = (val >> 24) & 0x03;
+	int c;
+	struct kvm_vcpu *vcpu;
+	unsigned long flags;
+
+	switch (mode) {
+	case 0x0:		/* as specified by targets */
+		break;
+	case 0x1:
+		targets = (1U << nr_vcpus) - 1;			/* all, ... */
+		targets &= ~(1U << source_vcpu->vcpu_id);	/* but self */
+		break;
+	case 0x2:		/* this very vCPU only */
+		targets = (1U << source_vcpu->vcpu_id);
+		break;
+	case 0x3:		/* reserved */
+		return;
+	}
+
+	kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+		struct vgic_irq *irq;
+
+		if (!(targets & (1U << c)))
+			continue;
+
+		irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = true;
+		irq->source |= 1U << source_vcpu->vcpu_id;
+
+		vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
+		vgic_put_irq(source_vcpu->kvm, irq);
+	}
+}
+
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	int i;
+	u64 val = 0;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		val |= (u64)irq->targets << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return val;
+}
+
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+				   gpa_t addr, unsigned int len,
+				   unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
+	int i;
+	unsigned long flags;
+
+	/* GICD_ITARGETSR[0-7] are read-only */
+	if (intid < VGIC_NR_PRIVATE_IRQS)
+		return;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+		int target;
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->targets = (val >> (i * 8)) & cpu_mask;
+		target = irq->targets ? __ffs(irq->targets) : 0;
+		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	u64 val = 0;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		val |= (u64)irq->source << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+	return val;
+}
+
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->source &= ~((val >> (i * 8)) & 0xff);
+		if (!irq->source)
+			irq->pending_latch = false;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	u32 intid = addr & 0x0f;
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		irq->source |= (val >> (i * 8)) & 0xff;
+
+		if (irq->source) {
+			irq->pending_latch = true;
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		} else {
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		}
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+#define GICC_ARCH_VERSION_V2	0x2
+
+/* These are for userland accesses only, there is no guest-facing emulation. */
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len)
+{
+	struct vgic_vmcr vmcr;
+	u32 val;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	switch (addr & 0xff) {
+	case GIC_CPU_CTRL:
+		val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
+		val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
+		val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
+		val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
+		val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
+		val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
+
+		break;
+	case GIC_CPU_PRIMASK:
+		/*
+		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+		 * the PMR field as GICH_VMCR.VMPriMask rather than
+		 * GICC_PMR.Priority, so we expose the upper five bits of
+		 * priority mask to userspace using the lower bits in the
+		 * unsigned long.
+		 */
+		val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
+			GICV_PMR_PRIORITY_SHIFT;
+		break;
+	case GIC_CPU_BINPOINT:
+		val = vmcr.bpr;
+		break;
+	case GIC_CPU_ALIAS_BINPOINT:
+		val = vmcr.abpr;
+		break;
+	case GIC_CPU_IDENT:
+		val = ((PRODUCT_ID_KVM << 20) |
+		       (GICC_ARCH_VERSION_V2 << 16) |
+		       IMPLEMENTER_ARM);
+		break;
+	default:
+		return 0;
+	}
+
+	return val;
+}
+
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+				   gpa_t addr, unsigned int len,
+				   unsigned long val)
+{
+	struct vgic_vmcr vmcr;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	switch (addr & 0xff) {
+	case GIC_CPU_CTRL:
+		vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
+		vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
+		vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
+		vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
+		vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
+		vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
+
+		break;
+	case GIC_CPU_PRIMASK:
+		/*
+		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+		 * the PMR field as GICH_VMCR.VMPriMask rather than
+		 * GICC_PMR.Priority, so we expose the upper five bits of
+		 * priority mask to userspace using the lower bits in the
+		 * unsigned long.
+		 */
+		vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
+			GICV_PMR_PRIORITY_MASK;
+		break;
+	case GIC_CPU_BINPOINT:
+		vmcr.bpr = val;
+		break;
+	case GIC_CPU_ALIAS_BINPOINT:
+		vmcr.abpr = val;
+		break;
+	}
+
+	vgic_set_vmcr(vcpu, &vmcr);
+}
+
+static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
+					gpa_t addr, unsigned int len)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return 0;
+		return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return 0;
+
+		n = array_index_nospec(n, 4);
+
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		return vgicv3->vgic_ap1r[n];
+	}
+}
+
+static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	int n; /* which APRn is this */
+
+	n = (addr >> 2) & 0x3;
+
+	if (kvm_vgic_global_state.type == VGIC_V2) {
+		/* GICv2 hardware systems support max. 32 groups */
+		if (n != 0)
+			return;
+		vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
+	} else {
+		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+		if (n > vgic_v3_max_apr_idx(vcpu))
+			return;
+
+		n = array_index_nospec(n, 4);
+
+		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+		vgicv3->vgic_ap1r[n] = val;
+	}
+}
+
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
+		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
+		NULL, vgic_mmio_uaccess_write_v2_misc,
+		12, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+		vgic_mmio_read_group, vgic_mmio_write_group,
+		NULL, vgic_mmio_uaccess_write_v2_group, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+		NULL, vgic_uaccess_write_cenable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		NULL, vgic_uaccess_write_spending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		NULL, vgic_uaccess_write_cpending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
+		vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
+		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
+		vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
+		vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
+		vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+};
+
+static const struct vgic_register_region vgic_v2_cpu_registers[] = {
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
+		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
+		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
+		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
+		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
+		vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
+		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+		VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
+{
+	dev->regions = vgic_v2_dist_registers;
+	dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+
+	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+	return SZ_4K;
+}
+
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	const struct vgic_register_region *region;
+	struct vgic_io_device iodev;
+	struct vgic_reg_attr reg_attr;
+	struct kvm_vcpu *vcpu;
+	gpa_t addr;
+	int ret;
+
+	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+	if (ret)
+		return ret;
+
+	vcpu = reg_attr.vcpu;
+	addr = reg_attr.addr;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+		iodev.regions = vgic_v2_dist_registers;
+		iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+		iodev.base_addr = 0;
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+		iodev.regions = vgic_v2_cpu_registers;
+		iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+		iodev.base_addr = 0;
+		break;
+	default:
+		return -ENXIO;
+	}
+
+	/* We only support aligned 32-bit accesses. */
+	if (addr & 3)
+		return -ENXIO;
+
+	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+	if (!region)
+		return -ENXIO;
+
+	return 0;
+}
+
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			  int offset, u32 *val)
+{
+	struct vgic_io_device dev = {
+		.regions = vgic_v2_cpu_registers,
+		.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+		.iodev_type = IODEV_CPUIF,
+	};
+
+	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val)
+{
+	struct vgic_io_device dev = {
+		.regions = vgic_v2_dist_registers,
+		.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+		.iodev_type = IODEV_DIST,
+	};
+
+	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
new file mode 100644
index 000000000000..89a14ec8b33b
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv3 MMIO handling functions
+ */
+
+#include <linux/bitfield.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/* extract @num bytes at @offset bytes offset in data */
+unsigned long extract_bytes(u64 data, unsigned int offset,
+			    unsigned int num)
+{
+	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
+}
+
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+		     unsigned long val)
+{
+	int lower = (offset & 4) * 8;
+	int upper = lower + 8 * len - 1;
+
+	reg &= ~GENMASK_ULL(upper, lower);
+	val &= GENMASK_ULL(len * 8 - 1, 0);
+
+	return reg | ((u64)val << lower);
+}
+
+bool vgic_has_its(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+		return false;
+
+	return dist->has_its;
+}
+
+bool vgic_supports_direct_msis(struct kvm *kvm)
+{
+	return (kvm_vgic_global_state.has_gicv4_1 ||
+		(kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+}
+
+/*
+ * The Revision field in the IIDR have the following meanings:
+ *
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ * 	       their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+	u32 value = 0;
+
+	switch (addr & 0x0c) {
+	case GICD_CTLR:
+		if (vgic->enabled)
+			value |= GICD_CTLR_ENABLE_SS_G1;
+		value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+		if (vgic->nassgireq)
+			value |= GICD_CTLR_nASSGIreq;
+		break;
+	case GICD_TYPER:
+		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+		value = (value >> 5) - 1;
+		if (vgic_has_its(vcpu->kvm)) {
+			value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+			value |= GICD_TYPER_LPIS;
+		} else {
+			value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+		}
+		break;
+	case GICD_TYPER2:
+		if (kvm_vgic_global_state.has_gicv4_1)
+			value = GICD_TYPER2_nASSGIcap;
+		break;
+	case GICD_IIDR:
+		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+		break;
+	default:
+		return 0;
+	}
+
+	return value;
+}
+
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (addr & 0x0c) {
+	case GICD_CTLR: {
+		bool was_enabled, is_hwsgi;
+
+		mutex_lock(&vcpu->kvm->lock);
+
+		was_enabled = dist->enabled;
+		is_hwsgi = dist->nassgireq;
+
+		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+		/* Not a GICv4.1? No HW SGIs */
+		if (!kvm_vgic_global_state.has_gicv4_1)
+			val &= ~GICD_CTLR_nASSGIreq;
+
+		/* Dist stays enabled? nASSGIreq is RO */
+		if (was_enabled && dist->enabled) {
+			val &= ~GICD_CTLR_nASSGIreq;
+			val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
+		}
+
+		/* Switching HW SGIs? */
+		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+		if (is_hwsgi != dist->nassgireq)
+			vgic_v4_configure_vsgis(vcpu->kvm);
+
+		if (kvm_vgic_global_state.has_gicv4_1 &&
+		    was_enabled != dist->enabled)
+			kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
+		else if (!was_enabled && dist->enabled)
+			vgic_kick_vcpus(vcpu->kvm);
+
+		mutex_unlock(&vcpu->kvm->lock);
+		break;
+	}
+	case GICD_TYPER:
+	case GICD_TYPER2:
+	case GICD_IIDR:
+		/* This is at best for documentation purposes... */
+		return;
+	}
+}
+
+static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
+					   gpa_t addr, unsigned int len,
+					   unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	switch (addr & 0x0c) {
+	case GICD_TYPER2:
+	case GICD_IIDR:
+		if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+			return -EINVAL;
+		return 0;
+	case GICD_CTLR:
+		/* Not a GICv4.1? No HW SGIs */
+		if (!kvm_vgic_global_state.has_gicv4_1)
+			val &= ~GICD_CTLR_nASSGIreq;
+
+		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+		return 0;
+	}
+
+	vgic_mmio_write_v3_misc(vcpu, addr, len, val);
+	return 0;
+}
+
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+					    gpa_t addr, unsigned int len)
+{
+	int intid = VGIC_ADDR_TO_INTID(addr, 64);
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+	unsigned long ret = 0;
+
+	if (!irq)
+		return 0;
+
+	/* The upper word is RAZ for us. */
+	if (!(addr & 4))
+		ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+
+	vgic_put_irq(vcpu->kvm, irq);
+	return ret;
+}
+
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val)
+{
+	int intid = VGIC_ADDR_TO_INTID(addr, 64);
+	struct vgic_irq *irq;
+	unsigned long flags;
+
+	/* The upper word is WI for us since we don't implement Aff3. */
+	if (addr & 4)
+		return;
+
+	irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+	if (!irq)
+		return;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	/* We only care about and preserve Aff0, Aff1 and Aff2. */
+	irq->mpidr = val & GENMASK(23, 0);
+	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	bool was_enabled = vgic_cpu->lpis_enabled;
+
+	if (!vgic_has_its(vcpu->kvm))
+		return;
+
+	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+	if (was_enabled && !vgic_cpu->lpis_enabled) {
+		vgic_flush_pending_lpis(vcpu);
+		vgic_its_invalidate_cache(vcpu->kvm);
+	}
+
+	if (!was_enabled && vgic_cpu->lpis_enabled)
+		vgic_enable_lpis(vcpu);
+}
+
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+					      gpa_t addr, unsigned int len)
+{
+	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
+	int target_vcpu_id = vcpu->vcpu_id;
+	gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
+			(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
+	u64 value;
+
+	value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+	value |= ((target_vcpu_id & 0xffff) << 8);
+
+	if (addr == last_rdist_typer)
+		value |= GICR_TYPER_LAST;
+	if (vgic_has_its(vcpu->kvm))
+		value |= GICR_TYPER_PLPIS;
+
+	return extract_bytes(value, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+					      gpa_t addr, unsigned int len)
+{
+	switch (addr & 0xffff) {
+	case GICD_PIDR2:
+		/* report a GICv3 compliant implementation */
+		return 0x3b;
+	}
+
+	return 0;
+}
+
+static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
+						  gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/*
+	 * pending state of interrupt is latched in pending_latch variable.
+	 * Userspace will save and restore pending state and line_level
+	 * separately.
+	 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
+	 * for handling of ISPENDR and ICPENDR.
+	 */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		bool state = irq->pending_latch;
+
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			int err;
+
+			err = irq_get_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    &state);
+			WARN_ON(err);
+		}
+
+		if (state)
+			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
+					 gpa_t addr, unsigned int len,
+					 unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		if (test_bit(i, &val)) {
+			/*
+			 * pending_latch is set irrespective of irq type
+			 * (level or edge) to avoid dependency that VM should
+			 * restore irq config before pending info.
+			 */
+			irq->pending_latch = true;
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		} else {
+			irq->pending_latch = false;
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_OuterShareable:
+		return GIC_BASER_InnerShareable;
+	default:
+		return field;
+	}
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_nCnB:
+	case GIC_BASER_CACHE_nC:
+		return GIC_BASER_CACHE_RaWb;
+	default:
+		return field;
+	}
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+	switch (field) {
+	case GIC_BASER_CACHE_SameAsInner:
+	case GIC_BASER_CACHE_nC:
+		return field;
+	default:
+		return GIC_BASER_CACHE_nC;
+	}
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+			u64 (*sanitise_fn)(u64))
+{
+	u64 field = (reg & field_mask) >> field_shift;
+
+	field = sanitise_fn(field) << field_shift;
+	return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK						\
+	(GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK						\
+	(BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |	\
+	 GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+				  GICR_PENDBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PENDBASER_RES0_MASK;
+
+	return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+				  GICR_PROPBASER_SHAREABILITY_SHIFT,
+				  vgic_sanitise_shareability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_inner_cacheability);
+	reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+				  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+				  vgic_sanitise_outer_cacheability);
+
+	reg &= ~PROPBASER_RES0_MASK;
+	return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+	return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 old_propbaser, propbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	do {
+		old_propbaser = READ_ONCE(dist->propbaser);
+		propbaser = old_propbaser;
+		propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+		propbaser = vgic_sanitise_propbaser(propbaser);
+	} while (cmpxchg64(&dist->propbaser, old_propbaser,
+			   propbaser) != old_propbaser);
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 value = vgic_cpu->pendbaser;
+
+	value &= ~GICR_PENDBASER_PTZ;
+
+	return extract_bytes(value, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 old_pendbaser, pendbaser;
+
+	/* Storing a value with LPIs already enabled is undefined */
+	if (vgic_cpu->lpis_enabled)
+		return;
+
+	do {
+		old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
+		pendbaser = old_pendbaser;
+		pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+		pendbaser = vgic_sanitise_pendbaser(pendbaser);
+	} while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
+			   pendbaser) != old_pendbaser);
+}
+
+/*
+ * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
+ * redistributors, while SPIs are covered by registers in the distributor
+ * block. Trying to set private IRQs in this block gets ignored.
+ * We take some special care here to fix the calculation of the register
+ * offset.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
+	{								\
+		.reg_offset = off,					\
+		.bits_per_irq = bpi,					\
+		.len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,		\
+		.access_flags = acc,					\
+		.read = vgic_mmio_read_raz,				\
+		.write = vgic_mmio_write_wi,				\
+	}, {								\
+		.reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,	\
+		.bits_per_irq = bpi,					\
+		.len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,	\
+		.access_flags = acc,					\
+		.read = rd,						\
+		.write = wr,						\
+		.uaccess_read = ur,					\
+		.uaccess_write = uw,					\
+	}
+
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+	REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
+		vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
+		NULL, vgic_mmio_uaccess_write_v3_misc,
+		16, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
+		vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+		vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+	       NULL, vgic_uaccess_write_cenable, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
+		1, VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+		vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+		VGIC_ACCESS_32bit),
+};
+
+static const struct vgic_register_region vgic_v3_rd_registers[] = {
+	/* RD_base registers */
+	REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+		vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+		vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+		vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
+		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+		VGIC_ACCESS_32bit),
+	/* SGI_base registers */
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
+		vgic_mmio_read_group, vgic_mmio_write_group, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
+		vgic_mmio_read_enable, vgic_mmio_write_senable,
+		NULL, vgic_uaccess_write_senable, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
+		vgic_mmio_read_enable, vgic_mmio_write_cenable,
+		NULL, vgic_uaccess_write_cenable, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
+		vgic_mmio_read_pending, vgic_mmio_write_spending,
+		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
+		vgic_mmio_read_pending, vgic_mmio_write_cpending,
+		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_sactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
+		vgic_mmio_read_active, vgic_mmio_write_cactive,
+		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
+		vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
+		vgic_mmio_read_config, vgic_mmio_write_config, 8,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
+		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+		VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+	dev->regions = vgic_v3_dist_registers;
+	dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+
+	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+	return SZ_64K;
+}
+
+/**
+ * vgic_register_redist_iodev - register a single redist iodev
+ * @vcpu:    The VCPU to which the redistributor belongs
+ *
+ * Register a KVM iodev for this VCPU's redistributor using the address
+ * provided.
+ *
+ * Return 0 on success, -ERRNO otherwise.
+ */
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct vgic_dist *vgic = &kvm->arch.vgic;
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+	struct vgic_redist_region *rdreg;
+	gpa_t rd_base;
+	int ret;
+
+	if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
+		return 0;
+
+	/*
+	 * We may be creating VCPUs before having set the base address for the
+	 * redistributor region, in which case we will come back to this
+	 * function for all VCPUs when the base address is set.  Just return
+	 * without doing any work for now.
+	 */
+	rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
+	if (!rdreg)
+		return 0;
+
+	if (!vgic_v3_check_base(kvm))
+		return -EINVAL;
+
+	vgic_cpu->rdreg = rdreg;
+
+	rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
+
+	kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+	rd_dev->base_addr = rd_base;
+	rd_dev->iodev_type = IODEV_REDIST;
+	rd_dev->regions = vgic_v3_rd_registers;
+	rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+	rd_dev->redist_vcpu = vcpu;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+				      2 * SZ_64K, &rd_dev->dev);
+	mutex_unlock(&kvm->slots_lock);
+
+	if (ret)
+		return ret;
+
+	rdreg->free_index++;
+	return 0;
+}
+
+static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
+{
+	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+
+	kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
+}
+
+static int vgic_register_all_redist_iodevs(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int c, ret = 0;
+
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		ret = vgic_register_redist_iodev(vcpu);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		/* The current c failed, so we start with the previous one. */
+		mutex_lock(&kvm->slots_lock);
+		for (c--; c >= 0; c--) {
+			vcpu = kvm_get_vcpu(kvm, c);
+			vgic_unregister_redist_iodev(vcpu);
+		}
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
+/**
+ * vgic_v3_insert_redist_region - Insert a new redistributor region
+ *
+ * Performs various checks before inserting the rdist region in the list.
+ * Those tests depend on whether the size of the rdist region is known
+ * (ie. count != 0). The list is sorted by rdist region index.
+ *
+ * @kvm: kvm handle
+ * @index: redist region index
+ * @base: base of the new rdist region
+ * @count: number of redistributors the region is made of (0 in the old style
+ * single region, whose size is induced from the number of vcpus)
+ *
+ * Return 0 on success, < 0 otherwise
+ */
+static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
+					gpa_t base, uint32_t count)
+{
+	struct vgic_dist *d = &kvm->arch.vgic;
+	struct vgic_redist_region *rdreg;
+	struct list_head *rd_regions = &d->rd_regions;
+	size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
+	int ret;
+
+	/* single rdist region already set ?*/
+	if (!count && !list_empty(rd_regions))
+		return -EINVAL;
+
+	/* cross the end of memory ? */
+	if (base + size < base)
+		return -EINVAL;
+
+	if (list_empty(rd_regions)) {
+		if (index != 0)
+			return -EINVAL;
+	} else {
+		rdreg = list_last_entry(rd_regions,
+					struct vgic_redist_region, list);
+		if (index != rdreg->index + 1)
+			return -EINVAL;
+
+		/* Cannot add an explicitly sized regions after legacy region */
+		if (!rdreg->count)
+			return -EINVAL;
+	}
+
+	/*
+	 * For legacy single-region redistributor regions (!count),
+	 * check that the redistributor region does not overlap with the
+	 * distributor's address space.
+	 */
+	if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+		vgic_dist_overlap(kvm, base, size))
+		return -EINVAL;
+
+	/* collision with any other rdist region? */
+	if (vgic_v3_rdist_overlap(kvm, base, size))
+		return -EINVAL;
+
+	rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
+	if (!rdreg)
+		return -ENOMEM;
+
+	rdreg->base = VGIC_ADDR_UNDEF;
+
+	ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
+	if (ret)
+		goto free;
+
+	rdreg->base = base;
+	rdreg->count = count;
+	rdreg->free_index = 0;
+	rdreg->index = index;
+
+	list_add_tail(&rdreg->list, rd_regions);
+	return 0;
+free:
+	kfree(rdreg);
+	return ret;
+}
+
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
+{
+	int ret;
+
+	ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+	if (ret)
+		return ret;
+
+	/*
+	 * Register iodevs for each existing VCPU.  Adding more VCPUs
+	 * afterwards will register the iodevs when needed.
+	 */
+	ret = vgic_register_all_redist_iodevs(kvm);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	const struct vgic_register_region *region;
+	struct vgic_io_device iodev;
+	struct vgic_reg_attr reg_attr;
+	struct kvm_vcpu *vcpu;
+	gpa_t addr;
+	int ret;
+
+	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+	if (ret)
+		return ret;
+
+	vcpu = reg_attr.vcpu;
+	addr = reg_attr.addr;
+
+	switch (attr->group) {
+	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+		iodev.regions = vgic_v3_dist_registers;
+		iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+		iodev.base_addr = 0;
+		break;
+	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{
+		iodev.regions = vgic_v3_rd_registers;
+		iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+		iodev.base_addr = 0;
+		break;
+	}
+	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+		u64 reg, id;
+
+		id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+		return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
+	}
+	default:
+		return -ENXIO;
+	}
+
+	/* We only support aligned 32-bit accesses. */
+	if (addr & 3)
+		return -ENXIO;
+
+	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+	if (!region)
+		return -ENXIO;
+
+	return 0;
+}
+/*
+ * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
+ * generation register ICC_SGI1R_EL1) with a given VCPU.
+ * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
+ * return -1.
+ */
+static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+{
+	unsigned long affinity;
+	int level0;
+
+	/*
+	 * Split the current VCPU's MPIDR into affinity level 0 and the
+	 * rest as this is what we have to compare against.
+	 */
+	affinity = kvm_vcpu_get_mpidr_aff(vcpu);
+	level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
+	affinity &= ~MPIDR_LEVEL_MASK;
+
+	/* bail out if the upper three levels don't match */
+	if (sgi_aff != affinity)
+		return -1;
+
+	/* Is this VCPU's bit set in the mask ? */
+	if (!(sgi_cpu_mask & BIT(level0)))
+		return -1;
+
+	return level0;
+}
+
+/*
+ * The ICC_SGI* registers encode the affinity differently from the MPIDR,
+ * so provide a wrapper to use the existing defines to isolate a certain
+ * affinity level.
+ */
+#define SGI_AFFINITY_LEVEL(reg, level) \
+	((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
+	>> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/**
+ * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
+ * @vcpu: The VCPU requesting a SGI
+ * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
+ * @allow_group1: Does the sysreg access allow generation of G1 SGIs
+ *
+ * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
+ * This will trap in sys_regs.c and call this function.
+ * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
+ * target processors as well as a bitmask of 16 Aff0 CPUs.
+ * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
+ * check for matching ones. If this bit is set, we signal all, but not the
+ * calling VCPU.
+ */
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *c_vcpu;
+	u16 target_cpus;
+	u64 mpidr;
+	int sgi, c;
+	int vcpu_id = vcpu->vcpu_id;
+	bool broadcast;
+	unsigned long flags;
+
+	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
+	broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+	target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
+	mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+	mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+	mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+
+	/*
+	 * We iterate over all VCPUs to find the MPIDRs matching the request.
+	 * If we have handled one CPU, we clear its bit to detect early
+	 * if we are already finished. This avoids iterating through all
+	 * VCPUs when most of the times we just signal a single VCPU.
+	 */
+	kvm_for_each_vcpu(c, c_vcpu, kvm) {
+		struct vgic_irq *irq;
+
+		/* Exit early if we have dealt with all requested CPUs */
+		if (!broadcast && target_cpus == 0)
+			break;
+
+		/* Don't signal the calling VCPU */
+		if (broadcast && c == vcpu_id)
+			continue;
+
+		if (!broadcast) {
+			int level0;
+
+			level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
+			if (level0 == -1)
+				continue;
+
+			/* remove this matching VCPU from the mask */
+			target_cpus &= ~BIT(level0);
+		}
+
+		irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		/*
+		 * An access targetting Group0 SGIs can only generate
+		 * those, while an access targetting Group1 SGIs can
+		 * generate interrupts of either group.
+		 */
+		if (!irq->group || allow_group1) {
+			if (!irq->hw) {
+				irq->pending_latch = true;
+				vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+			} else {
+				/* HW SGI? Ask the GIC to inject it */
+				int err;
+				err = irq_set_irqchip_state(irq->host_irq,
+							    IRQCHIP_STATE_PENDING,
+							    true);
+				WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+				raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+			}
+		} else {
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val)
+{
+	struct vgic_io_device dev = {
+		.regions = vgic_v3_dist_registers,
+		.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
+	};
+
+	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			   int offset, u32 *val)
+{
+	struct vgic_io_device rd_dev = {
+		.regions = vgic_v3_rd_registers,
+		.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
+	};
+
+	return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
+}
+
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+				    u32 intid, u64 *val)
+{
+	if (intid % 32)
+		return -EINVAL;
+
+	if (is_write)
+		vgic_write_irq_line_level_info(vcpu, intid, *val);
+	else
+		*val = vgic_read_irq_line_level_info(vcpu, intid);
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
new file mode 100644
index 000000000000..b2d73fc0d1ef
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC MMIO handling functions
+ */
+
+#include <linux/bitops.h>
+#include <linux/bsearch.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_arch_timer.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+				 gpa_t addr, unsigned int len)
+{
+	return 0;
+}
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+				 gpa_t addr, unsigned int len)
+{
+	return -1UL;
+}
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+			unsigned int len, unsigned long val)
+{
+	/* Ignore */
+}
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+			       unsigned int len, unsigned long val)
+{
+	/* Ignore */
+	return 0;
+}
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
+				   gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/* Loop over all IRQs affected by this read */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		if (irq->group)
+			value |= BIT(i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+static void vgic_update_vsgi(struct vgic_irq *irq)
+{
+	WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group));
+}
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+			   unsigned int len, unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->group = !!(val & BIT(i));
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			vgic_update_vsgi(irq);
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		} else {
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		}
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+/*
+ * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
+ * of the enabled bit, so there is only one function for both here.
+ */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/* Loop over all IRQs affected by this read */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		if (irq->enabled)
+			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			if (!irq->enabled) {
+				struct irq_data *data;
+
+				irq->enabled = true;
+				data = &irq_to_desc(irq->host_irq)->irq_data;
+				while (irqd_irq_disabled(data))
+					enable_irq(irq->host_irq);
+			}
+
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+			vgic_put_irq(vcpu->kvm, irq);
+
+			continue;
+		} else if (vgic_irq_is_mapped_level(irq)) {
+			bool was_high = irq->line_level;
+
+			/*
+			 * We need to update the state of the interrupt because
+			 * the guest might have changed the state of the device
+			 * while the interrupt was disabled at the VGIC level.
+			 */
+			irq->line_level = vgic_get_phys_line_level(irq);
+			/*
+			 * Deactivate the physical interrupt so the GIC will let
+			 * us know when it is asserted again.
+			 */
+			if (!irq->active && was_high && !irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+		irq->enabled = true;
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
+			disable_irq_nosync(irq->host_irq);
+
+		irq->enabled = false;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+			       gpa_t addr, unsigned int len,
+			       unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->enabled = true;
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+			       gpa_t addr, unsigned int len,
+			       unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->enabled = false;
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/* Loop over all IRQs affected by this read */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		unsigned long flags;
+		bool val;
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			int err;
+
+			val = false;
+			err = irq_get_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    &val);
+			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+		} else {
+			val = irq_is_pending(irq);
+		}
+
+		value |= ((u32)val << i);
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+	return (vgic_irq_is_sgi(irq->intid) &&
+		vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
+}
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		/* GICD_ISPENDR0 SGI bits are WI */
+		if (is_vgic_v2_sgi(vcpu, irq)) {
+			vgic_put_irq(vcpu->kvm, irq);
+			continue;
+		}
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			/* HW SGI? Ask the GIC to inject it */
+			int err;
+			err = irq_set_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    true);
+			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+			vgic_put_irq(vcpu->kvm, irq);
+
+			continue;
+		}
+
+		irq->pending_latch = true;
+		if (irq->hw)
+			vgic_irq_set_phys_active(irq, true);
+
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->pending_latch = true;
+
+		/*
+		 * GICv2 SGIs are terribly broken. We can't restore
+		 * the source of the interrupt, so just pick the vcpu
+		 * itself as the source...
+		 */
+		if (is_vgic_v2_sgi(vcpu, irq))
+			irq->source |= BIT(vcpu->vcpu_id);
+
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+	irq->pending_latch = false;
+
+	/*
+	 * We don't want the guest to effectively mask the physical
+	 * interrupt by doing a write to SPENDR followed by a write to
+	 * CPENDR for HW interrupts, so we clear the active state on
+	 * the physical side if the virtual interrupt is not active.
+	 * This may lead to taking an additional interrupt on the
+	 * host, but that should not be a problem as the worst that
+	 * can happen is an additional vgic injection.  We also clear
+	 * the pending state to maintain proper semantics for edge HW
+	 * interrupts.
+	 */
+	vgic_irq_set_phys_pending(irq, false);
+	if (!irq->active)
+		vgic_irq_set_phys_active(irq, false);
+}
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		/* GICD_ICPENDR0 SGI bits are WI */
+		if (is_vgic_v2_sgi(vcpu, irq)) {
+			vgic_put_irq(vcpu->kvm, irq);
+			continue;
+		}
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+			/* HW SGI? Ask the GIC to clear its pending bit */
+			int err;
+			err = irq_set_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    false);
+			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+			vgic_put_irq(vcpu->kvm, irq);
+
+			continue;
+		}
+
+		if (irq->hw)
+			vgic_hw_irq_cpending(vcpu, irq);
+		else
+			irq->pending_latch = false;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+	unsigned long flags;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		/*
+		 * More fun with GICv2 SGIs! If we're clearing one of them
+		 * from userspace, which source vcpu to clear? Let's not
+		 * even think of it, and blow the whole set.
+		 */
+		if (is_vgic_v2_sgi(vcpu, irq))
+			irq->source = 0;
+
+		irq->pending_latch = false;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return 0;
+}
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts as well as GICv3 private interrupts, we have to
+ * stop all the VCPUs because interrupts can be migrated while we don't hold
+ * the IRQ locks and we don't want to be chasing moving targets.
+ *
+ * For GICv2 private interrupts we don't have to do anything because
+ * userspace accesses to the VGIC state already require all VCPUs to be
+ * stopped, and only the VCPU itself can modify its private interrupts
+ * active state, which guarantees that the VCPU is not running.
+ */
+static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+	    intid >= VGIC_NR_PRIVATE_IRQS)
+		kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_access_active_prepare */
+static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+	    intid >= VGIC_NR_PRIVATE_IRQS)
+		kvm_arm_resume_guest(vcpu->kvm);
+}
+
+static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+					     gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 value = 0;
+	int i;
+
+	/* Loop over all IRQs affected by this read */
+	for (i = 0; i < len * 8; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		/*
+		 * Even for HW interrupts, don't evaluate the HW state as
+		 * all the guest is interested in is the virtual state.
+		 */
+		if (irq->active)
+			value |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	u32 val;
+
+	mutex_lock(&vcpu->kvm->lock);
+	vgic_access_active_prepare(vcpu, intid);
+
+	val = __vgic_mmio_read_active(vcpu, addr, len);
+
+	vgic_access_active_finish(vcpu, intid);
+	mutex_unlock(&vcpu->kvm->lock);
+
+	return val;
+}
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len)
+{
+	return __vgic_mmio_read_active(vcpu, addr, len);
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				      bool active, bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->active = active;
+	vgic_irq_set_phys_active(irq, active);
+}
+
+static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				    bool active)
+{
+	unsigned long flags;
+	struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
+		vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
+	} else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+		/*
+		 * GICv4.1 VSGI feature doesn't track an active state,
+		 * so let's not kid ourselves, there is nothing we can
+		 * do here.
+		 */
+		irq->active = false;
+	} else {
+		u32 model = vcpu->kvm->arch.vgic.vgic_model;
+		u8 active_source;
+
+		irq->active = active;
+
+		/*
+		 * The GICv2 architecture indicates that the source CPUID for
+		 * an SGI should be provided during an EOI which implies that
+		 * the active state is stored somewhere, but at the same time
+		 * this state is not architecturally exposed anywhere and we
+		 * have no way of knowing the right source.
+		 *
+		 * This may lead to a VCPU not being able to receive
+		 * additional instances of a particular SGI after migration
+		 * for a GICv2 VM on some GIC implementations.  Oh well.
+		 */
+		active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
+
+		if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+		    active && vgic_irq_is_sgi(irq->intid))
+			irq->active_source = active_source;
+	}
+
+	if (irq->active)
+		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+	else
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+}
+
+static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		vgic_mmio_change_active(vcpu, irq, false);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+	mutex_lock(&vcpu->kvm->lock);
+	vgic_access_active_prepare(vcpu, intid);
+
+	__vgic_mmio_write_cactive(vcpu, addr, len, val);
+
+	vgic_access_active_finish(vcpu, intid);
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	__vgic_mmio_write_cactive(vcpu, addr, len, val);
+	return 0;
+}
+
+static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len,
+				      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+	int i;
+
+	for_each_set_bit(i, &val, len * 8) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		vgic_mmio_change_active(vcpu, irq, true);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+	mutex_lock(&vcpu->kvm->lock);
+	vgic_access_active_prepare(vcpu, intid);
+
+	__vgic_mmio_write_sactive(vcpu, addr, len, val);
+
+	vgic_access_active_finish(vcpu, intid);
+	mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len,
+				     unsigned long val)
+{
+	__vgic_mmio_write_sactive(vcpu, addr, len, val);
+	return 0;
+}
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	int i;
+	u64 val = 0;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		val |= (u64)irq->priority << (i * 8);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return val;
+}
+
+/*
+ * We currently don't handle changing the priority of an interrupt that
+ * is already pending on a VCPU. If there is a need for this, we would
+ * need to make this VCPU exit and re-evaluate the priorities, potentially
+ * leading to this interrupt getting presented now to the guest (if it has
+ * been masked by the priority mask before).
+ */
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		/* Narrow the priority range to what we actually support */
+		irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
+		if (irq->hw && vgic_irq_is_sgi(irq->intid))
+			vgic_update_vsgi(irq);
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+	u32 value = 0;
+	int i;
+
+	for (i = 0; i < len * 4; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		if (irq->config == VGIC_CONFIG_EDGE)
+			value |= (2U << (i * 2));
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return value;
+}
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+			    gpa_t addr, unsigned int len,
+			    unsigned long val)
+{
+	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+	int i;
+	unsigned long flags;
+
+	for (i = 0; i < len * 4; i++) {
+		struct vgic_irq *irq;
+
+		/*
+		 * The configuration cannot be changed for SGIs in general,
+		 * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
+		 * code relies on PPIs being level triggered, so we also
+		 * make them read-only here.
+		 */
+		if (intid + i < VGIC_NR_PRIVATE_IRQS)
+			continue;
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		if (test_bit(i * 2 + 1, &val))
+			irq->config = VGIC_CONFIG_EDGE;
+		else
+			irq->config = VGIC_CONFIG_LEVEL;
+
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
+{
+	int i;
+	u64 val = 0;
+	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+	for (i = 0; i < 32; i++) {
+		struct vgic_irq *irq;
+
+		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+			continue;
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+		if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
+			val |= (1U << i);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return val;
+}
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+				    const u64 val)
+{
+	int i;
+	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+	unsigned long flags;
+
+	for (i = 0; i < 32; i++) {
+		struct vgic_irq *irq;
+		bool new_level;
+
+		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+			continue;
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+		/*
+		 * Line level is set irrespective of irq type
+		 * (level or edge) to avoid dependency that VM should
+		 * restore irq config before line level.
+		 */
+		new_level = !!(val & (1U << i));
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		irq->line_level = new_level;
+		if (new_level)
+			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+		else
+			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+static int match_region(const void *key, const void *elt)
+{
+	const unsigned int offset = (unsigned long)key;
+	const struct vgic_register_region *region = elt;
+
+	if (offset < region->reg_offset)
+		return -1;
+
+	if (offset >= region->reg_offset + region->len)
+		return 1;
+
+	return 0;
+}
+
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+		      int nr_regions, unsigned int offset)
+{
+	return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
+		       sizeof(regions[0]), match_region);
+}
+
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_set_vmcr(vcpu, vmcr);
+	else
+		vgic_v3_set_vmcr(vcpu, vmcr);
+}
+
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_get_vmcr(vcpu, vmcr);
+	else
+		vgic_v3_get_vmcr(vcpu, vmcr);
+}
+
+/*
+ * kvm_mmio_read_buf() returns a value in a format where it can be converted
+ * to a byte array and be directly observed as the guest wanted it to appear
+ * in memory if it had done the store itself, which is LE for the GIC, as the
+ * guest knows the GIC is always LE.
+ *
+ * We convert this value to the CPUs native format to deal with it as a data
+ * value.
+ */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
+{
+	unsigned long data = kvm_mmio_read_buf(val, len);
+
+	switch (len) {
+	case 1:
+		return data;
+	case 2:
+		return le16_to_cpu(data);
+	case 4:
+		return le32_to_cpu(data);
+	default:
+		return le64_to_cpu(data);
+	}
+}
+
+/*
+ * kvm_mmio_write_buf() expects a value in a format such that if converted to
+ * a byte array it is observed as the guest would see it if it could perform
+ * the load directly.  Since the GIC is LE, and the guest knows this, the
+ * guest expects a value in little endian format.
+ *
+ * We convert the data value from the CPUs native format to LE so that the
+ * value is returned in the proper format.
+ */
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+				unsigned long data)
+{
+	switch (len) {
+	case 1:
+		break;
+	case 2:
+		data = cpu_to_le16(data);
+		break;
+	case 4:
+		data = cpu_to_le32(data);
+		break;
+	default:
+		data = cpu_to_le64(data);
+	}
+
+	kvm_mmio_write_buf(buf, len, data);
+}
+
+static
+struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
+{
+	return container_of(dev, struct vgic_io_device, dev);
+}
+
+static bool check_region(const struct kvm *kvm,
+			 const struct vgic_register_region *region,
+			 gpa_t addr, int len)
+{
+	int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+	switch (len) {
+	case sizeof(u8):
+		flags = VGIC_ACCESS_8bit;
+		break;
+	case sizeof(u32):
+		flags = VGIC_ACCESS_32bit;
+		break;
+	case sizeof(u64):
+		flags = VGIC_ACCESS_64bit;
+		break;
+	default:
+		return false;
+	}
+
+	if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
+		if (!region->bits_per_irq)
+			return true;
+
+		/* Do we access a non-allocated IRQ? */
+		return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
+	}
+
+	return false;
+}
+
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+		     gpa_t addr, int len)
+{
+	const struct vgic_register_region *region;
+
+	region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+				       addr - iodev->base_addr);
+	if (!region || !check_region(vcpu->kvm, region, addr, len))
+		return NULL;
+
+	return region;
+}
+
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+			     gpa_t addr, u32 *val)
+{
+	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+	const struct vgic_register_region *region;
+	struct kvm_vcpu *r_vcpu;
+
+	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+	if (!region) {
+		*val = 0;
+		return 0;
+	}
+
+	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+	if (region->uaccess_read)
+		*val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
+	else
+		*val = region->read(r_vcpu, addr, sizeof(u32));
+
+	return 0;
+}
+
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+			      gpa_t addr, const u32 *val)
+{
+	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+	const struct vgic_register_region *region;
+	struct kvm_vcpu *r_vcpu;
+
+	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+	if (!region)
+		return 0;
+
+	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+	if (region->uaccess_write)
+		return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
+
+	region->write(r_vcpu, addr, sizeof(u32), *val);
+	return 0;
+}
+
+/*
+ * Userland access to VGIC registers.
+ */
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+		 bool is_write, int offset, u32 *val)
+{
+	if (is_write)
+		return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+	else
+		return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+}
+
+static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+			      gpa_t addr, int len, void *val)
+{
+	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+	const struct vgic_register_region *region;
+	unsigned long data = 0;
+
+	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+	if (!region) {
+		memset(val, 0, len);
+		return 0;
+	}
+
+	switch (iodev->iodev_type) {
+	case IODEV_CPUIF:
+		data = region->read(vcpu, addr, len);
+		break;
+	case IODEV_DIST:
+		data = region->read(vcpu, addr, len);
+		break;
+	case IODEV_REDIST:
+		data = region->read(iodev->redist_vcpu, addr, len);
+		break;
+	case IODEV_ITS:
+		data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+		break;
+	}
+
+	vgic_data_host_to_mmio_bus(val, len, data);
+	return 0;
+}
+
+static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+			       gpa_t addr, int len, const void *val)
+{
+	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+	const struct vgic_register_region *region;
+	unsigned long data = vgic_data_mmio_bus_to_host(val, len);
+
+	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+	if (!region)
+		return 0;
+
+	switch (iodev->iodev_type) {
+	case IODEV_CPUIF:
+		region->write(vcpu, addr, len, data);
+		break;
+	case IODEV_DIST:
+		region->write(vcpu, addr, len, data);
+		break;
+	case IODEV_REDIST:
+		region->write(iodev->redist_vcpu, addr, len, data);
+		break;
+	case IODEV_ITS:
+		region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+		break;
+	}
+
+	return 0;
+}
+
+struct kvm_io_device_ops kvm_io_gic_ops = {
+	.read = dispatch_mmio_read,
+	.write = dispatch_mmio_write,
+};
+
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+			     enum vgic_type type)
+{
+	struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
+	int ret = 0;
+	unsigned int len;
+
+	switch (type) {
+	case VGIC_V2:
+		len = vgic_v2_init_dist_iodev(io_device);
+		break;
+	case VGIC_V3:
+		len = vgic_v3_init_dist_iodev(io_device);
+		break;
+	default:
+		BUG_ON(1);
+	}
+
+	io_device->base_addr = dist_base_address;
+	io_device->iodev_type = IODEV_DIST;
+	io_device->redist_vcpu = NULL;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+				      len, &io_device->dev);
+	mutex_unlock(&kvm->slots_lock);
+
+	return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
new file mode 100644
index 000000000000..fefcca2b14dc
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_MMIO_H__
+#define __KVM_ARM_VGIC_MMIO_H__
+
+struct vgic_register_region {
+	unsigned int reg_offset;
+	unsigned int len;
+	unsigned int bits_per_irq;
+	unsigned int access_flags;
+	union {
+		unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+				      unsigned int len);
+		unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+					  gpa_t addr, unsigned int len);
+	};
+	union {
+		void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+			      unsigned int len, unsigned long val);
+		void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+				  gpa_t addr, unsigned int len,
+				  unsigned long val);
+	};
+	unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
+				      unsigned int len);
+	union {
+		int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
+				     unsigned int len, unsigned long val);
+		int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
+					 gpa_t addr, unsigned int len,
+					 unsigned long val);
+	};
+};
+
+extern struct kvm_io_device_ops kvm_io_gic_ops;
+
+#define VGIC_ACCESS_8bit	1
+#define VGIC_ACCESS_32bit	2
+#define VGIC_ACCESS_64bit	4
+
+/*
+ * Generate a mask that covers the number of bytes required to address
+ * up to 1024 interrupts, each represented by <bits> bits. This assumes
+ * that <bits> is a power of two.
+ */
+#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
+
+/*
+ * (addr & mask) gives us the _byte_ offset for the INT ID.
+ * We multiply this by 8 the get the _bit_ offset, then divide this by
+ * the number of bits to learn the actual INT ID.
+ * But instead of a division (which requires a "long long div" implementation),
+ * we shift by the binary logarithm of <bits>.
+ * This assumes that <bits> is a power of two.
+ */
+#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
+					8 >> ilog2(bits))
+
+/*
+ * Some VGIC registers store per-IRQ information, with a different number
+ * of bits per IRQ. For those registers this macro is used.
+ * The _WITH_LENGTH version instantiates registers with a fixed length
+ * and is mutually exclusive with the _PER_IRQ version.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc)	\
+	{								\
+		.reg_offset = off,					\
+		.bits_per_irq = bpi,					\
+		.len = bpi * 1024 / 8,					\
+		.access_flags = acc,					\
+		.read = rd,						\
+		.write = wr,						\
+		.uaccess_read = ur,					\
+		.uaccess_write = uw,					\
+	}
+
+#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)		\
+	{								\
+		.reg_offset = off,					\
+		.bits_per_irq = 0,					\
+		.len = length,						\
+		.access_flags = acc,					\
+		.read = rd,						\
+		.write = wr,						\
+	}
+
+#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
+	{								\
+		.reg_offset = off,					\
+		.bits_per_irq = 0,					\
+		.len = length,						\
+		.access_flags = acc,					\
+		.read = rd,						\
+		.write = wr,						\
+		.uaccess_read = urd,					\
+		.uaccess_write = uwr,					\
+	}
+
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
+
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+				unsigned long data);
+
+unsigned long extract_bytes(u64 data, unsigned int offset,
+			    unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+		     unsigned long val);
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+				 gpa_t addr, unsigned int len);
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+				 gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+			unsigned int len, unsigned long val);
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+			       unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
+				   unsigned int len);
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+			   unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val);
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val);
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+			       gpa_t addr, unsigned int len,
+			       unsigned long val);
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+			       gpa_t addr, unsigned int len,
+			       unsigned long val);
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+				     gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val);
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val);
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val);
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+				gpa_t addr, unsigned int len,
+				unsigned long val);
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len);
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val);
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+			     gpa_t addr, unsigned int len,
+			     unsigned long val);
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val);
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len,
+				    unsigned long val);
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+				      gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+			      gpa_t addr, unsigned int len,
+			      unsigned long val);
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+				    gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+			    gpa_t addr, unsigned int len,
+			    unsigned long val);
+
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+		 bool is_write, int offset, u32 *val);
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+				    const u64 val);
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+			u64 (*sanitise_fn)(u64));
+
+/* Find the proper register handler entry given a certain address offset */
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+		      int nr_regions, unsigned int offset);
+
+#endif
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
new file mode 100644
index 000000000000..621cc168fe3f
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+static inline void vgic_v2_write_lr(int lr, u32 val)
+{
+	void __iomem *base = kvm_vgic_global_state.vctrl_base;
+
+	writel_relaxed(val, base + GICH_LR0 + (lr * 4));
+}
+
+void vgic_v2_init_lrs(void)
+{
+	int i;
+
+	for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
+		vgic_v2_write_lr(i, 0);
+}
+
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
+static bool lr_signals_eoi_mi(u32 lr_val)
+{
+	return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
+	       !(lr_val & GICH_LR_HW);
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ *   - transferred as is in case of edge sensitive IRQs
+ *   - set to the line-level (resample time) for level sensitive IRQs
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+	int lr;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+		u32 val = cpuif->vgic_lr[lr];
+		u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+		struct vgic_irq *irq;
+
+		/* Extract the source vCPU id from the LR */
+		cpuid = val & GICH_LR_PHYSID_CPUID;
+		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+		cpuid &= 7;
+
+		/* Notify fds when the guest EOI'ed a level-triggered SPI */
+		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+			kvm_notify_acked_irq(vcpu->kvm, 0,
+					     intid - VGIC_NR_PRIVATE_IRQS);
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+		raw_spin_lock(&irq->irq_lock);
+
+		/* Always preserve the active bit */
+		irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+		if (irq->active && vgic_irq_is_sgi(intid))
+			irq->active_source = cpuid;
+
+		/* Edge is the only case where we preserve the pending bit */
+		if (irq->config == VGIC_CONFIG_EDGE &&
+		    (val & GICH_LR_PENDING_BIT)) {
+			irq->pending_latch = true;
+
+			if (vgic_irq_is_sgi(intid))
+				irq->source |= (1 << cpuid);
+		}
+
+		/*
+		 * Clear soft pending state when level irqs have been acked.
+		 */
+		if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
+			irq->pending_latch = false;
+
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
+		raw_spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	vgic_cpu->used_lrs = 0;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ *   it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+	u32 val = irq->intid;
+	bool allow_pending = true;
+
+	if (irq->active) {
+		val |= GICH_LR_ACTIVE_BIT;
+		if (vgic_irq_is_sgi(irq->intid))
+			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+		if (vgic_irq_is_multi_sgi(irq)) {
+			allow_pending = false;
+			val |= GICH_LR_EOI;
+		}
+	}
+
+	if (irq->group)
+		val |= GICH_LR_GROUP1;
+
+	if (irq->hw) {
+		val |= GICH_LR_HW;
+		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+		/*
+		 * Never set pending+active on a HW interrupt, as the
+		 * pending state is kept at the physical distributor
+		 * level.
+		 */
+		if (irq->active)
+			allow_pending = false;
+	} else {
+		if (irq->config == VGIC_CONFIG_LEVEL) {
+			val |= GICH_LR_EOI;
+
+			/*
+			 * Software resampling doesn't work very well
+			 * if we allow P+A, so let's not do that.
+			 */
+			if (irq->active)
+				allow_pending = false;
+		}
+	}
+
+	if (allow_pending && irq_is_pending(irq)) {
+		val |= GICH_LR_PENDING_BIT;
+
+		if (irq->config == VGIC_CONFIG_EDGE)
+			irq->pending_latch = false;
+
+		if (vgic_irq_is_sgi(irq->intid)) {
+			u32 src = ffs(irq->source);
+
+			if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+					   irq->intid))
+				return;
+
+			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+			irq->source &= ~(1 << (src - 1));
+			if (irq->source) {
+				irq->pending_latch = true;
+				val |= GICH_LR_EOI;
+			}
+		}
+	}
+
+	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC.  We therefore lower the line
+	 * level here, so that we can take new virtual IRQs.  See
+	 * vgic_v2_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
+	/* The GICv2 LR only holds five bits of priority. */
+	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
+
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+	u32 vmcr;
+
+	vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
+		GICH_VMCR_ENABLE_GRP0_MASK;
+	vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
+		GICH_VMCR_ENABLE_GRP1_MASK;
+	vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
+		GICH_VMCR_ACK_CTL_MASK;
+	vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
+		GICH_VMCR_FIQ_EN_MASK;
+	vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
+		GICH_VMCR_CBPR_MASK;
+	vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
+		GICH_VMCR_EOI_MODE_MASK;
+	vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
+		GICH_VMCR_ALIAS_BINPOINT_MASK;
+	vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
+		GICH_VMCR_BINPOINT_MASK;
+	vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
+		 GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
+
+	cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+	u32 vmcr;
+
+	vmcr = cpu_if->vgic_vmcr;
+
+	vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
+		GICH_VMCR_ENABLE_GRP0_SHIFT;
+	vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
+		GICH_VMCR_ENABLE_GRP1_SHIFT;
+	vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
+		GICH_VMCR_ACK_CTL_SHIFT;
+	vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
+		GICH_VMCR_FIQ_EN_SHIFT;
+	vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
+		GICH_VMCR_CBPR_SHIFT;
+	vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
+		GICH_VMCR_EOI_MODE_SHIFT;
+
+	vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
+			GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+	vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
+			GICH_VMCR_BINPOINT_SHIFT;
+	vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
+			GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
+}
+
+void vgic_v2_enable(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * By forcing VMCR to zero, the GIC will restore the binary
+	 * points to their reset values. Anything else resets to zero
+	 * anyway.
+	 */
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+
+	/* Get the show on the road... */
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
+{
+	if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
+		return false;
+	if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
+		return false;
+
+	if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
+		return true;
+	if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
+		return true;
+
+	return false;
+}
+
+int vgic_v2_map_resources(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	int ret = 0;
+
+	if (vgic_ready(kvm))
+		goto out;
+
+	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+	    IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
+		kvm_err("Need to set vgic cpu and dist addresses first\n");
+		ret = -ENXIO;
+		goto out;
+	}
+
+	if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
+		kvm_err("VGIC CPU and dist frames overlap\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Initialize the vgic if this hasn't already been done on demand by
+	 * accessing the vgic state from userspace.
+	 */
+	ret = vgic_init(kvm);
+	if (ret) {
+		kvm_err("Unable to initialize VGIC dynamic data structures\n");
+		goto out;
+	}
+
+	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
+	if (ret) {
+		kvm_err("Unable to register VGIC MMIO regions\n");
+		goto out;
+	}
+
+	if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+		ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+					    kvm_vgic_global_state.vcpu_base,
+					    KVM_VGIC_V2_CPU_SIZE, true);
+		if (ret) {
+			kvm_err("Unable to remap VGIC CPU to VCPU\n");
+			goto out;
+		}
+	}
+
+	dist->ready = true;
+
+out:
+	return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
+/**
+ * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller
+ * @info:	pointer to the GIC description
+ *
+ * Returns 0 if the VGICv2 has been probed successfully, returns an error code
+ * otherwise
+ */
+int vgic_v2_probe(const struct gic_kvm_info *info)
+{
+	int ret;
+	u32 vtr;
+
+	if (!info->vctrl.start) {
+		kvm_err("GICH not present in the firmware table\n");
+		return -ENXIO;
+	}
+
+	if (!PAGE_ALIGNED(info->vcpu.start) ||
+	    !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+		kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+
+		ret = create_hyp_io_mappings(info->vcpu.start,
+					     resource_size(&info->vcpu),
+					     &kvm_vgic_global_state.vcpu_base_va,
+					     &kvm_vgic_global_state.vcpu_hyp_va);
+		if (ret) {
+			kvm_err("Cannot map GICV into hyp\n");
+			goto out;
+		}
+
+		static_branch_enable(&vgic_v2_cpuif_trap);
+	}
+
+	ret = create_hyp_io_mappings(info->vctrl.start,
+				     resource_size(&info->vctrl),
+				     &kvm_vgic_global_state.vctrl_base,
+				     &kvm_vgic_global_state.vctrl_hyp);
+	if (ret) {
+		kvm_err("Cannot map VCTRL into hyp\n");
+		goto out;
+	}
+
+	vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
+	kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+
+	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+	if (ret) {
+		kvm_err("Cannot register GICv2 KVM device\n");
+		goto out;
+	}
+
+	kvm_vgic_global_state.can_emulate_gicv2 = true;
+	kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+	kvm_vgic_global_state.type = VGIC_V2;
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
+
+	kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
+
+	return 0;
+out:
+	if (kvm_vgic_global_state.vctrl_base)
+		iounmap(kvm_vgic_global_state.vctrl_base);
+	if (kvm_vgic_global_state.vcpu_base_va)
+		iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+	return ret;
+}
+
+static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 elrsr;
+	int i;
+
+	elrsr = readl_relaxed(base + GICH_ELRSR0);
+	if (unlikely(used_lrs > 32))
+		elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
+
+	for (i = 0; i < used_lrs; i++) {
+		if (elrsr & (1UL << i))
+			cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
+		else
+			cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
+
+		writel_relaxed(0, base + GICH_LR0 + (i * 4));
+	}
+}
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+	void __iomem *base = kvm_vgic_global_state.vctrl_base;
+	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+
+	if (!base)
+		return;
+
+	if (used_lrs) {
+		save_lrs(vcpu, base);
+		writel_relaxed(0, base + GICH_HCR);
+	}
+}
+
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+	void __iomem *base = kvm_vgic_global_state.vctrl_base;
+	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	int i;
+
+	if (!base)
+		return;
+
+	if (used_lrs) {
+		writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+		for (i = 0; i < used_lrs; i++) {
+			writel_relaxed(cpu_if->vgic_lr[i],
+				       base + GICH_LR0 + (i * 4));
+		}
+	}
+}
+
+void vgic_v2_load(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	writel_relaxed(cpu_if->vgic_vmcr,
+		       kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+	writel_relaxed(cpu_if->vgic_apr,
+		       kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
+
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+}
+
+void vgic_v2_put(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+	vgic_v2_vmcr_sync(vcpu);
+	cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
new file mode 100644
index 000000000000..5bc2ab58954b
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
+
+#include "vgic.h"
+
+static bool group0_trap;
+static bool group1_trap;
+static bool common_trap;
+static bool gicv4_enable;
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+static bool lr_signals_eoi_mi(u64 lr_val)
+{
+	return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
+	       !(lr_val & ICH_LR_HW);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	int lr;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+
+	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+		u64 val = cpuif->vgic_lr[lr];
+		u32 intid, cpuid;
+		struct vgic_irq *irq;
+		bool is_v2_sgi = false;
+
+		cpuid = val & GICH_LR_PHYSID_CPUID;
+		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+
+		if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+			intid = val & ICH_LR_VIRTUAL_ID_MASK;
+		} else {
+			intid = val & GICH_LR_VIRTUALID;
+			is_v2_sgi = vgic_irq_is_sgi(intid);
+		}
+
+		/* Notify fds when the guest EOI'ed a level-triggered IRQ */
+		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+			kvm_notify_acked_irq(vcpu->kvm, 0,
+					     intid - VGIC_NR_PRIVATE_IRQS);
+
+		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+		if (!irq)	/* An LPI could have been unmapped. */
+			continue;
+
+		raw_spin_lock(&irq->irq_lock);
+
+		/* Always preserve the active bit */
+		irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+		if (irq->active && is_v2_sgi)
+			irq->active_source = cpuid;
+
+		/* Edge is the only case where we preserve the pending bit */
+		if (irq->config == VGIC_CONFIG_EDGE &&
+		    (val & ICH_LR_PENDING_BIT)) {
+			irq->pending_latch = true;
+
+			if (is_v2_sgi)
+				irq->source |= (1 << cpuid);
+		}
+
+		/*
+		 * Clear soft pending state when level irqs have been acked.
+		 */
+		if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+			irq->pending_latch = false;
+
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
+		raw_spin_unlock(&irq->irq_lock);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	vgic_cpu->used_lrs = 0;
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	u64 val = irq->intid;
+	bool allow_pending = true, is_v2_sgi;
+
+	is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
+		     model == KVM_DEV_TYPE_ARM_VGIC_V2);
+
+	if (irq->active) {
+		val |= ICH_LR_ACTIVE_BIT;
+		if (is_v2_sgi)
+			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+		if (vgic_irq_is_multi_sgi(irq)) {
+			allow_pending = false;
+			val |= ICH_LR_EOI;
+		}
+	}
+
+	if (irq->hw) {
+		val |= ICH_LR_HW;
+		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+		/*
+		 * Never set pending+active on a HW interrupt, as the
+		 * pending state is kept at the physical distributor
+		 * level.
+		 */
+		if (irq->active)
+			allow_pending = false;
+	} else {
+		if (irq->config == VGIC_CONFIG_LEVEL) {
+			val |= ICH_LR_EOI;
+
+			/*
+			 * Software resampling doesn't work very well
+			 * if we allow P+A, so let's not do that.
+			 */
+			if (irq->active)
+				allow_pending = false;
+		}
+	}
+
+	if (allow_pending && irq_is_pending(irq)) {
+		val |= ICH_LR_PENDING_BIT;
+
+		if (irq->config == VGIC_CONFIG_EDGE)
+			irq->pending_latch = false;
+
+		if (vgic_irq_is_sgi(irq->intid) &&
+		    model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+			u32 src = ffs(irq->source);
+
+			if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+					   irq->intid))
+				return;
+
+			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+			irq->source &= ~(1 << (src - 1));
+			if (irq->source) {
+				irq->pending_latch = true;
+				val |= ICH_LR_EOI;
+			}
+		}
+	}
+
+	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC.  We therefore lower the line
+	 * level here, so that we can take new virtual IRQs.  See
+	 * vgic_v3_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
+	if (irq->group)
+		val |= ICH_LR_GROUP;
+
+	val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
+
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	u32 vmcr;
+
+	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+		vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
+			ICH_VMCR_ACK_CTL_MASK;
+		vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
+			ICH_VMCR_FIQ_EN_MASK;
+	} else {
+		/*
+		 * When emulating GICv3 on GICv3 with SRE=1 on the
+		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
+		 */
+		vmcr = ICH_VMCR_FIQ_EN_MASK;
+	}
+
+	vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
+	vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
+	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
+	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
+	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+	vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
+	vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
+
+	cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	u32 model = vcpu->kvm->arch.vgic.vgic_model;
+	u32 vmcr;
+
+	vmcr = cpu_if->vgic_vmcr;
+
+	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+		vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
+			ICH_VMCR_ACK_CTL_SHIFT;
+		vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
+			ICH_VMCR_FIQ_EN_SHIFT;
+	} else {
+		/*
+		 * When emulating GICv3 on GICv3 with SRE=1 on the
+		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
+		 */
+		vmcrp->fiqen = 1;
+		vmcrp->ackctl = 0;
+	}
+
+	vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+	vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
+	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+	vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
+	vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
+}
+
+#define INITIAL_PENDBASER_VALUE						  \
+	(GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)		| \
+	GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)	| \
+	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
+void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	/*
+	 * By forcing VMCR to zero, the GIC will restore the binary
+	 * points to their reset values. Anything else resets to zero
+	 * anyway.
+	 */
+	vgic_v3->vgic_vmcr = 0;
+
+	/*
+	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
+	 * way, so we force SRE to 1 to demonstrate this to the guest.
+	 * Also, we don't support any form of IRQ/FIQ bypass.
+	 * This goes with the spec allowing the value to be RAO/WI.
+	 */
+	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+		vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
+				     ICC_SRE_EL1_DFB |
+				     ICC_SRE_EL1_SRE);
+		vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+	} else {
+		vgic_v3->vgic_sre = 0;
+	}
+
+	vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
+					   ICH_VTR_ID_BITS_MASK) >>
+					   ICH_VTR_ID_BITS_SHIFT;
+	vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
+					    ICH_VTR_PRI_BITS_MASK) >>
+					    ICH_VTR_PRI_BITS_SHIFT) + 1;
+
+	/* Get the show on the road... */
+	vgic_v3->vgic_hcr = ICH_HCR_EN;
+	if (group0_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+	if (group1_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+	if (common_trap)
+		vgic_v3->vgic_hcr |= ICH_HCR_TC;
+}
+
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
+{
+	struct kvm_vcpu *vcpu;
+	int byte_offset, bit_nr;
+	gpa_t pendbase, ptr;
+	bool status;
+	u8 val;
+	int ret;
+	unsigned long flags;
+
+retry:
+	vcpu = irq->target_vcpu;
+	if (!vcpu)
+		return 0;
+
+	pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
+	byte_offset = irq->intid / BITS_PER_BYTE;
+	bit_nr = irq->intid % BITS_PER_BYTE;
+	ptr = pendbase + byte_offset;
+
+	ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+	if (ret)
+		return ret;
+
+	status = val & (1 << bit_nr);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	if (irq->target_vcpu != vcpu) {
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		goto retry;
+	}
+	irq->pending_latch = status;
+	vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+	if (status) {
+		/* clear consumed data */
+		val &= ~(1 << bit_nr);
+		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
+ * kvm lock and all vcpu lock must be held
+ */
+int vgic_v3_save_pending_tables(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	gpa_t last_ptr = ~(gpa_t)0;
+	int ret;
+	u8 val;
+
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		int byte_offset, bit_nr;
+		struct kvm_vcpu *vcpu;
+		gpa_t pendbase, ptr;
+		bool stored;
+
+		vcpu = irq->target_vcpu;
+		if (!vcpu)
+			continue;
+
+		pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
+		byte_offset = irq->intid / BITS_PER_BYTE;
+		bit_nr = irq->intid % BITS_PER_BYTE;
+		ptr = pendbase + byte_offset;
+
+		if (ptr != last_ptr) {
+			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+			if (ret)
+				return ret;
+			last_ptr = ptr;
+		}
+
+		stored = val & (1U << bit_nr);
+		if (stored == irq->pending_latch)
+			continue;
+
+		if (irq->pending_latch)
+			val |= 1 << bit_nr;
+		else
+			val &= ~(1 << bit_nr);
+
+		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ * vgic_v3_rdist_overlap - check if a region overlaps with any
+ * existing redistributor region
+ *
+ * @kvm: kvm handle
+ * @base: base of the region
+ * @size: size of region
+ *
+ * Return: true if there is an overlap
+ */
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+	struct vgic_dist *d = &kvm->arch.vgic;
+	struct vgic_redist_region *rdreg;
+
+	list_for_each_entry(rdreg, &d->rd_regions, list) {
+		if ((base + size > rdreg->base) &&
+			(base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Check for overlapping regions and for regions crossing the end of memory
+ * for base addresses which have already been set.
+ */
+bool vgic_v3_check_base(struct kvm *kvm)
+{
+	struct vgic_dist *d = &kvm->arch.vgic;
+	struct vgic_redist_region *rdreg;
+
+	if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+	    d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
+		return false;
+
+	list_for_each_entry(rdreg, &d->rd_regions, list) {
+		if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
+			rdreg->base)
+			return false;
+	}
+
+	if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
+		return true;
+
+	return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
+				      KVM_VGIC_V3_DIST_SIZE);
+}
+
+/**
+ * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
+ * which has free space to put a new rdist region.
+ *
+ * @rd_regions: redistributor region list head
+ *
+ * A redistributor regions maps n redistributors, n = region size / (2 x 64kB).
+ * Stride between redistributors is 0 and regions are filled in the index order.
+ *
+ * Return: the redist region handle, if any, that has space to map a new rdist
+ * region.
+ */
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
+{
+	struct vgic_redist_region *rdreg;
+
+	list_for_each_entry(rdreg, rd_regions, list) {
+		if (!vgic_v3_redist_region_full(rdreg))
+			return rdreg;
+	}
+	return NULL;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+							   u32 index)
+{
+	struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
+	struct vgic_redist_region *rdreg;
+
+	list_for_each_entry(rdreg, rd_regions, list) {
+		if (rdreg->index == index)
+			return rdreg;
+	}
+	return NULL;
+}
+
+
+int vgic_v3_map_resources(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct kvm_vcpu *vcpu;
+	int ret = 0;
+	int c;
+
+	if (vgic_ready(kvm))
+		goto out;
+
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+		if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
+			kvm_debug("vcpu %d redistributor base not set\n", c);
+			ret = -ENXIO;
+			goto out;
+		}
+	}
+
+	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
+		kvm_err("Need to set vgic distributor addresses first\n");
+		ret = -ENXIO;
+		goto out;
+	}
+
+	if (!vgic_v3_check_base(kvm)) {
+		kvm_err("VGIC redist and dist frames overlap\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * For a VGICv3 we require the userland to explicitly initialize
+	 * the VGIC before we need to use it.
+	 */
+	if (!vgic_initialized(kvm)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
+	if (ret) {
+		kvm_err("Unable to register VGICv3 dist MMIO regions\n");
+		goto out;
+	}
+
+	if (kvm_vgic_global_state.has_gicv4_1)
+		vgic_v4_configure_vsgis(kvm);
+	dist->ready = true;
+
+out:
+	return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+
+static int __init early_group0_trap_cfg(char *buf)
+{
+	return strtobool(buf, &group0_trap);
+}
+early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
+
+static int __init early_group1_trap_cfg(char *buf)
+{
+	return strtobool(buf, &group1_trap);
+}
+early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
+
+static int __init early_common_trap_cfg(char *buf)
+{
+	return strtobool(buf, &common_trap);
+}
+early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
+
+static int __init early_gicv4_enable(char *buf)
+{
+	return strtobool(buf, &gicv4_enable);
+}
+early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
+
+/**
+ * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
+ * @info:	pointer to the GIC description
+ *
+ * Returns 0 if the VGICv3 has been probed successfully, returns an error code
+ * otherwise
+ */
+int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+	u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
+	int ret;
+
+	/*
+	 * The ListRegs field is 5 bits, but there is a architectural
+	 * maximum of 16 list registers. Just ignore bit 4...
+	 */
+	kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+	kvm_vgic_global_state.can_emulate_gicv2 = false;
+	kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
+
+	/* GICv4 support? */
+	if (info->has_v4) {
+		kvm_vgic_global_state.has_gicv4 = gicv4_enable;
+		kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
+		kvm_info("GICv4%s support %sabled\n",
+			 kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
+			 gicv4_enable ? "en" : "dis");
+	}
+
+	if (!info->vcpu.start) {
+		kvm_info("GICv3: no GICV resource entry\n");
+		kvm_vgic_global_state.vcpu_base = 0;
+	} else if (!PAGE_ALIGNED(info->vcpu.start)) {
+		pr_warn("GICV physical address 0x%llx not page aligned\n",
+			(unsigned long long)info->vcpu.start);
+		kvm_vgic_global_state.vcpu_base = 0;
+	} else {
+		kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+		kvm_vgic_global_state.can_emulate_gicv2 = true;
+		ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+		if (ret) {
+			kvm_err("Cannot register GICv2 KVM device.\n");
+			return ret;
+		}
+		kvm_info("vgic-v2@%llx\n", info->vcpu.start);
+	}
+	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+	if (ret) {
+		kvm_err("Cannot register GICv3 KVM device.\n");
+		kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+		return ret;
+	}
+
+	if (kvm_vgic_global_state.vcpu_base == 0)
+		kvm_info("disabling GICv2 emulation\n");
+
+	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+		group0_trap = true;
+		group1_trap = true;
+	}
+
+	if (group0_trap || group1_trap || common_trap) {
+		kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
+			 group0_trap ? "G0" : "",
+			 group1_trap ? "G1" : "",
+			 common_trap ? "C"  : "");
+		static_branch_enable(&vgic_v3_cpuif_trap);
+	}
+
+	kvm_vgic_global_state.vctrl_base = NULL;
+	kvm_vgic_global_state.type = VGIC_V3;
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+	return 0;
+}
+
+void vgic_v3_load(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	/*
+	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+	 * VMCR_EL2 save/restore in the world switch.
+	 */
+	if (likely(cpu_if->vgic_sre))
+		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+
+	kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
+
+	if (has_vhe())
+		__vgic_v3_activate_traps(vcpu);
+
+	WARN_ON(vgic_v4_load(vcpu));
+}
+
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+	if (likely(cpu_if->vgic_sre))
+		cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
+}
+
+void vgic_v3_put(struct kvm_vcpu *vcpu)
+{
+	WARN_ON(vgic_v4_put(vcpu, false));
+
+	vgic_v3_vmcr_sync(vcpu);
+
+	kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
+
+	if (has_vhe())
+		__vgic_v3_deactivate_traps(vcpu);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
new file mode 100644
index 000000000000..27ac833e5ec7
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kvm_host.h>
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include "vgic.h"
+
+/*
+ * How KVM uses GICv4 (insert rude comments here):
+ *
+ * The vgic-v4 layer acts as a bridge between several entities:
+ * - The GICv4 ITS representation offered by the ITS driver
+ * - VFIO, which is in charge of the PCI endpoint
+ * - The virtual ITS, which is the only thing the guest sees
+ *
+ * The configuration of VLPIs is triggered by a callback from VFIO,
+ * instructing KVM that a PCI device has been configured to deliver
+ * MSIs to a vITS.
+ *
+ * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
+ * and this is used to find the corresponding vITS data structures
+ * (ITS instance, device, event and irq) using a process that is
+ * extremely similar to the injection of an MSI.
+ *
+ * At this stage, we can link the guest's view of an LPI (uniquely
+ * identified by the routing entry) and the host irq, using the GICv4
+ * driver mapping operation. Should the mapping succeed, we've then
+ * successfully upgraded the guest's LPI to a VLPI. We can then start
+ * with updating GICv4's view of the property table and generating an
+ * INValidation in order to kickstart the delivery of this VLPI to the
+ * guest directly, without software intervention. Well, almost.
+ *
+ * When the PCI endpoint is deconfigured, this operation is reversed
+ * with VFIO calling kvm_vgic_v4_unset_forwarding().
+ *
+ * Once the VLPI has been mapped, it needs to follow any change the
+ * guest performs on its LPI through the vITS. For that, a number of
+ * command handlers have hooks to communicate these changes to the HW:
+ * - Any invalidation triggers a call to its_prop_update_vlpi()
+ * - The INT command results in a irq_set_irqchip_state(), which
+ *   generates an INT on the corresponding VLPI.
+ * - The CLEAR command results in a irq_set_irqchip_state(), which
+ *   generates an CLEAR on the corresponding VLPI.
+ * - DISCARD translates into an unmap, similar to a call to
+ *   kvm_vgic_v4_unset_forwarding().
+ * - MOVI is translated by an update of the existing mapping, changing
+ *   the target vcpu, resulting in a VMOVI being generated.
+ * - MOVALL is translated by a string of mapping updates (similar to
+ *   the handling of MOVI). MOVALL is horrible.
+ *
+ * Note that a DISCARD/MAPTI sequence emitted from the guest without
+ * reprogramming the PCI endpoint after MAPTI does not result in a
+ * VLPI being mapped, as there is no callback from VFIO (the guest
+ * will get the interrupt via the normal SW injection). Fixing this is
+ * not trivial, and requires some horrible messing with the VFIO
+ * internals. Not fun. Don't do that.
+ *
+ * Then there is the scheduling. Each time a vcpu is about to run on a
+ * physical CPU, KVM must tell the corresponding redistributor about
+ * it. And if we've migrated our vcpu from one CPU to another, we must
+ * tell the ITS (so that the messages reach the right redistributor).
+ * This is done in two steps: first issue a irq_set_affinity() on the
+ * irq corresponding to the vcpu, then call its_make_vpe_resident().
+ * You must be in a non-preemptible context. On exit, a call to
+ * its_make_vpe_non_resident() tells the redistributor that we're done
+ * with the vcpu.
+ *
+ * Finally, the doorbell handling: Each vcpu is allocated an interrupt
+ * which will fire each time a VLPI is made pending whilst the vcpu is
+ * not running. Each time the vcpu gets blocked, the doorbell
+ * interrupt gets enabled. When the vcpu is unblocked (for whatever
+ * reason), the doorbell interrupt is disabled.
+ */
+
+#define DB_IRQ_FLAGS	(IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
+
+static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
+{
+	struct kvm_vcpu *vcpu = info;
+
+	/* We got the message, no need to fire again */
+	if (!kvm_vgic_global_state.has_gicv4_1 &&
+	    !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
+		disable_irq_nosync(irq);
+
+	vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+	kvm_vcpu_kick(vcpu);
+
+	return IRQ_HANDLED;
+}
+
+static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
+{
+	vpe->sgi_config[irq->intid].enabled	= irq->enabled;
+	vpe->sgi_config[irq->intid].group 	= irq->group;
+	vpe->sgi_config[irq->intid].priority	= irq->priority;
+}
+
+static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
+{
+	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+	int i;
+
+	/*
+	 * With GICv4.1, every virtual SGI can be directly injected. So
+	 * let's pretend that they are HW interrupts, tied to a host
+	 * IRQ. The SGI code will do its magic.
+	 */
+	for (i = 0; i < VGIC_NR_SGIS; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+		struct irq_desc *desc;
+		unsigned long flags;
+		int ret;
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		if (irq->hw)
+			goto unlock;
+
+		irq->hw = true;
+		irq->host_irq = irq_find_mapping(vpe->sgi_domain, i);
+
+		/* Transfer the full irq state to the vPE */
+		vgic_v4_sync_sgi_config(vpe, irq);
+		desc = irq_to_desc(irq->host_irq);
+		ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
+					      false);
+		if (!WARN_ON(ret)) {
+			/* Transfer pending state */
+			ret = irq_set_irqchip_state(irq->host_irq,
+						    IRQCHIP_STATE_PENDING,
+						    irq->pending_latch);
+			WARN_ON(ret);
+			irq->pending_latch = false;
+		}
+	unlock:
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	for (i = 0; i < VGIC_NR_SGIS; i++) {
+		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+		struct irq_desc *desc;
+		unsigned long flags;
+		int ret;
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+		if (!irq->hw)
+			goto unlock;
+
+		irq->hw = false;
+		ret = irq_get_irqchip_state(irq->host_irq,
+					    IRQCHIP_STATE_PENDING,
+					    &irq->pending_latch);
+		WARN_ON(ret);
+
+		desc = irq_to_desc(irq->host_irq);
+		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+	unlock:
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+}
+
+/* Must be called with the kvm lock held */
+void vgic_v4_configure_vsgis(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_arm_halt_guest(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (dist->nassgireq)
+			vgic_v4_enable_vsgis(vcpu);
+		else
+			vgic_v4_disable_vsgis(vcpu);
+	}
+
+	kvm_arm_resume_guest(kvm);
+}
+
+/**
+ * vgic_v4_init - Initialize the GICv4 data structures
+ * @kvm:	Pointer to the VM being initialized
+ *
+ * We may be called each time a vITS is created, or when the
+ * vgic is initialized. This relies on kvm->lock to be
+ * held. In both cases, the number of vcpus should now be
+ * fixed.
+ */
+int vgic_v4_init(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct kvm_vcpu *vcpu;
+	int i, nr_vcpus, ret;
+
+	if (!kvm_vgic_global_state.has_gicv4)
+		return 0; /* Nothing to see here... move along. */
+
+	if (dist->its_vm.vpes)
+		return 0;
+
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+	dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
+				    GFP_KERNEL);
+	if (!dist->its_vm.vpes)
+		return -ENOMEM;
+
+	dist->its_vm.nr_vpes = nr_vcpus;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+	ret = its_alloc_vcpu_irqs(&dist->its_vm);
+	if (ret < 0) {
+		kvm_err("VPE IRQ allocation failure\n");
+		kfree(dist->its_vm.vpes);
+		dist->its_vm.nr_vpes = 0;
+		dist->its_vm.vpes = NULL;
+		return ret;
+	}
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		int irq = dist->its_vm.vpes[i]->irq;
+		unsigned long irq_flags = DB_IRQ_FLAGS;
+
+		/*
+		 * Don't automatically enable the doorbell, as we're
+		 * flipping it back and forth when the vcpu gets
+		 * blocked. Also disable the lazy disabling, as the
+		 * doorbell could kick us out of the guest too
+		 * early...
+		 *
+		 * On GICv4.1, the doorbell is managed in HW and must
+		 * be left enabled.
+		 */
+		if (kvm_vgic_global_state.has_gicv4_1)
+			irq_flags &= ~IRQ_NOAUTOEN;
+		irq_set_status_flags(irq, irq_flags);
+
+		ret = request_irq(irq, vgic_v4_doorbell_handler,
+				  0, "vcpu", vcpu);
+		if (ret) {
+			kvm_err("failed to allocate vcpu IRQ%d\n", irq);
+			/*
+			 * Trick: adjust the number of vpes so we know
+			 * how many to nuke on teardown...
+			 */
+			dist->its_vm.nr_vpes = i;
+			break;
+		}
+	}
+
+	if (ret)
+		vgic_v4_teardown(kvm);
+
+	return ret;
+}
+
+/**
+ * vgic_v4_teardown - Free the GICv4 data structures
+ * @kvm:	Pointer to the VM being destroyed
+ *
+ * Relies on kvm->lock to be held.
+ */
+void vgic_v4_teardown(struct kvm *kvm)
+{
+	struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
+	int i;
+
+	if (!its_vm->vpes)
+		return;
+
+	for (i = 0; i < its_vm->nr_vpes; i++) {
+		struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
+		int irq = its_vm->vpes[i]->irq;
+
+		irq_clear_status_flags(irq, DB_IRQ_FLAGS);
+		free_irq(irq, vcpu);
+	}
+
+	its_free_vcpu_irqs(its_vm);
+	kfree(its_vm->vpes);
+	its_vm->nr_vpes = 0;
+	its_vm->vpes = NULL;
+}
+
+int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+{
+	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+	if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+		return 0;
+
+	return its_make_vpe_non_resident(vpe, need_db);
+}
+
+int vgic_v4_load(struct kvm_vcpu *vcpu)
+{
+	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+	int err;
+
+	if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+		return 0;
+
+	/*
+	 * Before making the VPE resident, make sure the redistributor
+	 * corresponding to our current CPU expects us here. See the
+	 * doc in drivers/irqchip/irq-gic-v4.c to understand how this
+	 * turns into a VMOVP command at the ITS level.
+	 */
+	err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
+	if (err)
+		return err;
+
+	err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
+	if (err)
+		return err;
+
+	/*
+	 * Now that the VPE is resident, let's get rid of a potential
+	 * doorbell interrupt that would still be pending. This is a
+	 * GICv4.0 only "feature"...
+	 */
+	if (!kvm_vgic_global_state.has_gicv4_1)
+		err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
+
+	return err;
+}
+
+static struct vgic_its *vgic_get_its(struct kvm *kvm,
+				     struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+	struct kvm_msi msi  = (struct kvm_msi) {
+		.address_lo	= irq_entry->msi.address_lo,
+		.address_hi	= irq_entry->msi.address_hi,
+		.data		= irq_entry->msi.data,
+		.flags		= irq_entry->msi.flags,
+		.devid		= irq_entry->msi.devid,
+	};
+
+	return vgic_msi_to_its(kvm, &msi);
+}
+
+int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
+			       struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+	struct vgic_its *its;
+	struct vgic_irq *irq;
+	struct its_vlpi_map map;
+	int ret;
+
+	if (!vgic_supports_direct_msis(kvm))
+		return 0;
+
+	/*
+	 * Get the ITS, and escape early on error (not a valid
+	 * doorbell for any of our vITSs).
+	 */
+	its = vgic_get_its(kvm, irq_entry);
+	if (IS_ERR(its))
+		return 0;
+
+	mutex_lock(&its->its_lock);
+
+	/* Perform the actual DevID/EventID -> LPI translation. */
+	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+				   irq_entry->msi.data, &irq);
+	if (ret)
+		goto out;
+
+	/*
+	 * Emit the mapping request. If it fails, the ITS probably
+	 * isn't v4 compatible, so let's silently bail out. Holding
+	 * the ITS lock should ensure that nothing can modify the
+	 * target vcpu.
+	 */
+	map = (struct its_vlpi_map) {
+		.vm		= &kvm->arch.vgic.its_vm,
+		.vpe		= &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
+		.vintid		= irq->intid,
+		.properties	= ((irq->priority & 0xfc) |
+				   (irq->enabled ? LPI_PROP_ENABLED : 0) |
+				   LPI_PROP_GROUP1),
+		.db_enabled	= true,
+	};
+
+	ret = its_map_vlpi(virq, &map);
+	if (ret)
+		goto out;
+
+	irq->hw		= true;
+	irq->host_irq	= virq;
+	atomic_inc(&map.vpe->vlpi_count);
+
+out:
+	mutex_unlock(&its->its_lock);
+	return ret;
+}
+
+int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
+				 struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+	struct vgic_its *its;
+	struct vgic_irq *irq;
+	int ret;
+
+	if (!vgic_supports_direct_msis(kvm))
+		return 0;
+
+	/*
+	 * Get the ITS, and escape early on error (not a valid
+	 * doorbell for any of our vITSs).
+	 */
+	its = vgic_get_its(kvm, irq_entry);
+	if (IS_ERR(its))
+		return 0;
+
+	mutex_lock(&its->its_lock);
+
+	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+				   irq_entry->msi.data, &irq);
+	if (ret)
+		goto out;
+
+	WARN_ON(!(irq->hw && irq->host_irq == virq));
+	if (irq->hw) {
+		atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
+		irq->hw = false;
+		ret = its_unmap_vlpi(virq);
+	}
+
+out:
+	mutex_unlock(&its->its_lock);
+	return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
new file mode 100644
index 000000000000..99b02ca730a8
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -0,0 +1,1011 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/list_sort.h>
+#include <linux/nospec.h>
+
+#include <asm/kvm_hyp.h>
+
+#include "vgic.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+struct vgic_global kvm_vgic_global_state __ro_after_init = {
+	.gicv3_cpuif = STATIC_KEY_FALSE_INIT,
+};
+
+/*
+ * Locking order is always:
+ * kvm->lock (mutex)
+ *   its->cmd_lock (mutex)
+ *     its->its_lock (mutex)
+ *       vgic_cpu->ap_list_lock		must be taken with IRQs disabled
+ *         kvm->lpi_list_lock		must be taken with IRQs disabled
+ *           vgic_irq->irq_lock		must be taken with IRQs disabled
+ *
+ * As the ap_list_lock might be taken from the timer interrupt handler,
+ * we have to disable IRQs before taking this lock and everything lower
+ * than it.
+ *
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-aquire it after having
+ * taken the upper one.
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ *   vcpuX->vcpu_id < vcpuY->vcpu_id:
+ *     raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ *     raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ *
+ * Since the VGIC must support injecting virtual interrupts from ISRs, we have
+ * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
+ * spinlocks for any lock that may be taken while injecting an interrupt.
+ */
+
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq = NULL;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+		if (irq->intid != intid)
+			continue;
+
+		/*
+		 * This increases the refcount, the caller is expected to
+		 * call vgic_put_irq() later once it's finished with the IRQ.
+		 */
+		vgic_get_irq_kref(irq);
+		goto out_unlock;
+	}
+	irq = NULL;
+
+out_unlock:
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+			      u32 intid)
+{
+	/* SGIs and PPIs */
+	if (intid <= VGIC_MAX_PRIVATE) {
+		intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
+		return &vcpu->arch.vgic_cpu.private_irqs[intid];
+	}
+
+	/* SPIs */
+	if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+		intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
+		return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
+	}
+
+	/* LPIs */
+	if (intid >= VGIC_MIN_LPI)
+		return vgic_get_lpi(kvm, intid);
+
+	WARN(1, "Looking up struct vgic_irq for reserved INTID");
+	return NULL;
+}
+
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+/*
+ * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
+ */
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
+	if (!kref_put(&irq->refcount, vgic_irq_release))
+		return;
+
+	list_del(&irq->lpi_list);
+	dist->lpi_list_count--;
+
+	kfree(irq);
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	unsigned long flags;
+
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+	__vgic_put_lpi_locked(kvm, irq);
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq, *tmp;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+		if (irq->intid >= VGIC_MIN_LPI) {
+			raw_spin_lock(&irq->irq_lock);
+			list_del(&irq->ap_list);
+			irq->vcpu = NULL;
+			raw_spin_unlock(&irq->irq_lock);
+			vgic_put_irq(vcpu->kvm, irq);
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
+{
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      pending));
+}
+
+bool vgic_get_phys_line_level(struct vgic_irq *irq)
+{
+	bool line_level;
+
+	BUG_ON(!irq->hw);
+
+	if (irq->get_input_level)
+		return irq->get_input_level(irq->intid);
+
+	WARN_ON(irq_get_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      &line_level));
+	return line_level;
+}
+
+/* Set/Clear the physical active state */
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
+{
+
+	BUG_ON(!irq->hw);
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_ACTIVE,
+				      active));
+}
+
+/**
+ * kvm_vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq:	The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+	lockdep_assert_held(&irq->irq_lock);
+
+	/* If the interrupt is active, it must stay on the current vcpu */
+	if (irq->active)
+		return irq->vcpu ? : irq->target_vcpu;
+
+	/*
+	 * If the IRQ is not active but enabled and pending, we should direct
+	 * it to its configured target VCPU.
+	 * If the distributor is disabled, pending interrupts shouldn't be
+	 * forwarded.
+	 */
+	if (irq->enabled && irq_is_pending(irq)) {
+		if (unlikely(irq->target_vcpu &&
+			     !irq->target_vcpu->kvm->arch.vgic.enabled))
+			return NULL;
+
+		return irq->target_vcpu;
+	}
+
+	/* If neither active nor pending and enabled, then this IRQ should not
+	 * be queued to any VCPU.
+	 */
+	return NULL;
+}
+
+/*
+ * The order of items in the ap_lists defines how we'll pack things in LRs as
+ * well, the first items in the list being the first things populated in the
+ * LRs.
+ *
+ * A hard rule is that active interrupts can never be pushed out of the LRs
+ * (and therefore take priority) since we cannot reliably trap on deactivation
+ * of IRQs and therefore they have to be present in the LRs.
+ *
+ * Otherwise things should be sorted by the priority field and the GIC
+ * hardware support will take care of preemption of priority groups etc.
+ *
+ * Return negative if "a" sorts before "b", 0 to preserve order, and positive
+ * to sort "b" before "a".
+ */
+static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
+	struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+	bool penda, pendb;
+	int ret;
+
+	/*
+	 * list_sort may call this function with the same element when
+	 * the list is fairly long.
+	 */
+	if (unlikely(irqa == irqb))
+		return 0;
+
+	raw_spin_lock(&irqa->irq_lock);
+	raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+
+	if (irqa->active || irqb->active) {
+		ret = (int)irqb->active - (int)irqa->active;
+		goto out;
+	}
+
+	penda = irqa->enabled && irq_is_pending(irqa);
+	pendb = irqb->enabled && irq_is_pending(irqb);
+
+	if (!penda || !pendb) {
+		ret = (int)pendb - (int)penda;
+		goto out;
+	}
+
+	/* Both pending and enabled, sort by priority */
+	ret = irqa->priority - irqb->priority;
+out:
+	raw_spin_unlock(&irqb->irq_lock);
+	raw_spin_unlock(&irqa->irq_lock);
+	return ret;
+}
+
+/* Must be called with the ap_list_lock held */
+static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+	list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+}
+
+/*
+ * Only valid injection if changing level for level-triggered IRQs or for a
+ * rising edge, and in-kernel connected IRQ lines can only be controlled by
+ * their owner.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
+{
+	if (irq->owner != owner)
+		return false;
+
+	switch (irq->config) {
+	case VGIC_CONFIG_LEVEL:
+		return irq->line_level != level;
+	case VGIC_CONFIG_EDGE:
+		return level;
+	}
+
+	return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags)
+{
+	struct kvm_vcpu *vcpu;
+
+	lockdep_assert_held(&irq->irq_lock);
+
+retry:
+	vcpu = vgic_target_oracle(irq);
+	if (irq->vcpu || !vcpu) {
+		/*
+		 * If this IRQ is already on a VCPU's ap_list, then it
+		 * cannot be moved or modified and there is no more work for
+		 * us to do.
+		 *
+		 * Otherwise, if the irq is not pending and enabled, it does
+		 * not need to be inserted into an ap_list and there is also
+		 * no more work for us to do.
+		 */
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+		/*
+		 * We have to kick the VCPU here, because we could be
+		 * queueing an edge-triggered interrupt for which we
+		 * get no EOI maintenance interrupt. In that case,
+		 * while the IRQ is already on the VCPU's AP list, the
+		 * VCPU could have EOI'ed the original interrupt and
+		 * won't see this one until it exits for some other
+		 * reason.
+		 */
+		if (vcpu) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
+		return false;
+	}
+
+	/*
+	 * We must unlock the irq lock to take the ap_list_lock where
+	 * we are going to insert this new pending interrupt.
+	 */
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	/* someone can do stuff here, which we re-check below */
+
+	raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+	raw_spin_lock(&irq->irq_lock);
+
+	/*
+	 * Did something change behind our backs?
+	 *
+	 * There are two cases:
+	 * 1) The irq lost its pending state or was disabled behind our
+	 *    backs and/or it was queued to another VCPU's ap_list.
+	 * 2) Someone changed the affinity on this irq behind our
+	 *    backs and we are now holding the wrong ap_list_lock.
+	 *
+	 * In both cases, drop the locks and retry.
+	 */
+
+	if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
+					   flags);
+
+		raw_spin_lock_irqsave(&irq->irq_lock, flags);
+		goto retry;
+	}
+
+	/*
+	 * Grab a reference to the irq to reflect the fact that it is
+	 * now in the ap_list.
+	 */
+	vgic_get_irq_kref(irq);
+	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+	irq->vcpu = vcpu;
+
+	raw_spin_unlock(&irq->irq_lock);
+	raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+	kvm_vcpu_kick(vcpu);
+
+	return true;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @intid:   The INTID to inject a new state to.
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *			      false: to ignore the call
+ *	     Level-sensitive  true:  raise the input signal
+ *			      false: lower the input signal
+ * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
+ *           that the caller is allowed to inject this IRQ.  Userspace
+ *           injections will have owner == NULL.
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+			bool level, void *owner)
+{
+	struct kvm_vcpu *vcpu;
+	struct vgic_irq *irq;
+	unsigned long flags;
+	int ret;
+
+	trace_vgic_update_irq_pending(cpuid, intid, level);
+
+	ret = vgic_lazy_init(kvm);
+	if (ret)
+		return ret;
+
+	vcpu = kvm_get_vcpu(kvm, cpuid);
+	if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+		return -EINVAL;
+
+	irq = vgic_get_irq(kvm, vcpu, intid);
+	if (!irq)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+	if (!vgic_validate_injection(irq, level, owner)) {
+		/* Nothing to see here, move along... */
+		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+		vgic_put_irq(kvm, irq);
+		return 0;
+	}
+
+	if (irq->config == VGIC_CONFIG_LEVEL)
+		irq->line_level = level;
+	else
+		irq->pending_latch = true;
+
+	vgic_queue_irq_unlock(kvm, irq, flags);
+	vgic_put_irq(kvm, irq);
+
+	return 0;
+}
+
+/* @irq->irq_lock must be held */
+static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+			    unsigned int host_irq,
+			    bool (*get_input_level)(int vindid))
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+
+	/*
+	 * Find the physical IRQ number corresponding to @host_irq
+	 */
+	desc = irq_to_desc(host_irq);
+	if (!desc) {
+		kvm_err("%s: no interrupt descriptor\n", __func__);
+		return -EINVAL;
+	}
+	data = irq_desc_get_irq_data(desc);
+	while (data->parent_data)
+		data = data->parent_data;
+
+	irq->hw = true;
+	irq->host_irq = host_irq;
+	irq->hwintid = data->hwirq;
+	irq->get_input_level = get_input_level;
+	return 0;
+}
+
+/* @irq->irq_lock must be held */
+static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
+{
+	irq->hw = false;
+	irq->hwintid = 0;
+	irq->get_input_level = NULL;
+}
+
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
+			  u32 vintid, bool (*get_input_level)(int vindid))
+{
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	unsigned long flags;
+	int ret;
+
+	BUG_ON(!irq);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return ret;
+}
+
+/**
+ * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
+ * @vcpu: The VCPU pointer
+ * @vintid: The INTID of the interrupt
+ *
+ * Reset the active and pending states of a mapped interrupt.  Kernel
+ * subsystems injecting mapped interrupts should reset their interrupt lines
+ * when we are doing a reset of the VM.
+ */
+void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
+{
+	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	unsigned long flags;
+
+	if (!irq->hw)
+		goto out;
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	irq->active = false;
+	irq->pending_latch = false;
+	irq->line_level = false;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+out:
+	vgic_put_irq(vcpu->kvm, irq);
+}
+
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	BUG_ON(!irq);
+
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	kvm_vgic_unmap_irq(irq);
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return 0;
+}
+
+/**
+ * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
+ *
+ * @vcpu:   Pointer to the VCPU (used for PPIs)
+ * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
+ * @owner:  Opaque pointer to the owner
+ *
+ * Returns 0 if intid is not already used by another in-kernel device and the
+ * owner is set, otherwise returns an error code.
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+	struct vgic_irq *irq;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return -EAGAIN;
+
+	/* SGIs and LPIs cannot be wired up to any device */
+	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+		return -EINVAL;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	if (irq->owner && irq->owner != owner)
+		ret = -EEXIST;
+	else
+		irq->owner = owner;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+	return ret;
+}
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq, *tmp;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+retry:
+	raw_spin_lock(&vgic_cpu->ap_list_lock);
+
+	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+		bool target_vcpu_needs_kick = false;
+
+		raw_spin_lock(&irq->irq_lock);
+
+		BUG_ON(vcpu != irq->vcpu);
+
+		target_vcpu = vgic_target_oracle(irq);
+
+		if (!target_vcpu) {
+			/*
+			 * We don't need to process this interrupt any
+			 * further, move it off the list.
+			 */
+			list_del(&irq->ap_list);
+			irq->vcpu = NULL;
+			raw_spin_unlock(&irq->irq_lock);
+
+			/*
+			 * This vgic_put_irq call matches the
+			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
+			 * where we added the LPI to the ap_list. As
+			 * we remove the irq from the list, we drop
+			 * also drop the refcount.
+			 */
+			vgic_put_irq(vcpu->kvm, irq);
+			continue;
+		}
+
+		if (target_vcpu == vcpu) {
+			/* We're on the right CPU */
+			raw_spin_unlock(&irq->irq_lock);
+			continue;
+		}
+
+		/* This interrupt looks like it has to be migrated. */
+
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+		/*
+		 * Ensure locking order by always locking the smallest
+		 * ID first.
+		 */
+		if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+			vcpuA = vcpu;
+			vcpuB = target_vcpu;
+		} else {
+			vcpuA = target_vcpu;
+			vcpuB = vcpu;
+		}
+
+		raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+		raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+				      SINGLE_DEPTH_NESTING);
+		raw_spin_lock(&irq->irq_lock);
+
+		/*
+		 * If the affinity has been preserved, move the
+		 * interrupt around. Otherwise, it means things have
+		 * changed while the interrupt was unlocked, and we
+		 * need to replay this.
+		 *
+		 * In all cases, we cannot trust the list not to have
+		 * changed, so we restart from the beginning.
+		 */
+		if (target_vcpu == vgic_target_oracle(irq)) {
+			struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+			list_del(&irq->ap_list);
+			irq->vcpu = target_vcpu;
+			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+			target_vcpu_needs_kick = true;
+		}
+
+		raw_spin_unlock(&irq->irq_lock);
+		raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+		raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+
+		if (target_vcpu_needs_kick) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
+			kvm_vcpu_kick(target_vcpu);
+		}
+
+		goto retry;
+	}
+
+	raw_spin_unlock(&vgic_cpu->ap_list_lock);
+}
+
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_fold_lr_state(vcpu);
+	else
+		vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
+				    struct vgic_irq *irq, int lr)
+{
+	lockdep_assert_held(&irq->irq_lock);
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_populate_lr(vcpu, irq, lr);
+	else
+		vgic_v3_populate_lr(vcpu, irq, lr);
+}
+
+static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_clear_lr(vcpu, lr);
+	else
+		vgic_v3_clear_lr(vcpu, lr);
+}
+
+static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
+{
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_set_underflow(vcpu);
+	else
+		vgic_v3_set_underflow(vcpu);
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
+				 bool *multi_sgi)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq;
+	int count = 0;
+
+	*multi_sgi = false;
+
+	lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+		int w;
+
+		raw_spin_lock(&irq->irq_lock);
+		/* GICv2 SGIs can count for more than one... */
+		w = vgic_irq_get_lr_count(irq);
+		raw_spin_unlock(&irq->irq_lock);
+
+		count += w;
+		*multi_sgi |= (w > 1);
+	}
+	return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq;
+	int count;
+	bool multi_sgi;
+	u8 prio = 0xff;
+
+	lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+	count = compute_ap_list_depth(vcpu, &multi_sgi);
+	if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
+		vgic_sort_ap_list(vcpu);
+
+	count = 0;
+
+	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+		raw_spin_lock(&irq->irq_lock);
+
+		/*
+		 * If we have multi-SGIs in the pipeline, we need to
+		 * guarantee that they are all seen before any IRQ of
+		 * lower priority. In that case, we need to filter out
+		 * these interrupts by exiting early. This is easy as
+		 * the AP list has been sorted already.
+		 */
+		if (multi_sgi && irq->priority > prio) {
+			_raw_spin_unlock(&irq->irq_lock);
+			break;
+		}
+
+		if (likely(vgic_target_oracle(irq) == vcpu)) {
+			vgic_populate_lr(vcpu, irq, count++);
+
+			if (irq->source)
+				prio = irq->priority;
+		}
+
+		raw_spin_unlock(&irq->irq_lock);
+
+		if (count == kvm_vgic_global_state.nr_lr) {
+			if (!list_is_last(&irq->ap_list,
+					  &vgic_cpu->ap_list_head))
+				vgic_set_underflow(vcpu);
+			break;
+		}
+	}
+
+	vcpu->arch.vgic_cpu.used_lrs = count;
+
+	/* Nuke remaining LRs */
+	for ( ; count < kvm_vgic_global_state.nr_lr; count++)
+		vgic_clear_lr(vcpu, count);
+}
+
+static inline bool can_access_vgic_from_kernel(void)
+{
+	/*
+	 * GICv2 can always be accessed from the kernel because it is
+	 * memory-mapped, and VHE systems can access GICv3 EL2 system
+	 * registers.
+	 */
+	return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
+}
+
+static inline void vgic_save_state(struct kvm_vcpu *vcpu)
+{
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		vgic_v2_save_state(vcpu);
+	else
+		__vgic_v3_save_state(vcpu);
+}
+
+/* Sync back the hardware VGIC state into our emulation after a guest's run. */
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	/* An empty ap_list_head implies used_lrs == 0 */
+	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+		return;
+
+	if (can_access_vgic_from_kernel())
+		vgic_save_state(vcpu);
+
+	if (vgic_cpu->used_lrs)
+		vgic_fold_lr_state(vcpu);
+	vgic_prune_ap_list(vcpu);
+}
+
+static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		vgic_v2_restore_state(vcpu);
+	else
+		__vgic_v3_restore_state(vcpu);
+}
+
+/* Flush our emulation state into the GIC hardware before entering the guest. */
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * If there are no virtual interrupts active or pending for this
+	 * VCPU, then there is no work to do and we can bail out without
+	 * taking any lock.  There is a potential race with someone injecting
+	 * interrupts to the VCPU, but it is a benign race as the VCPU will
+	 * either observe the new interrupt before or after doing this check,
+	 * and introducing additional synchronization mechanism doesn't change
+	 * this.
+	 *
+	 * Note that we still need to go through the whole thing if anything
+	 * can be directly injected (GICv4).
+	 */
+	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+	    !vgic_supports_direct_msis(vcpu->kvm))
+		return;
+
+	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+		vgic_flush_lr_state(vcpu);
+		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+	}
+
+	if (can_access_vgic_from_kernel())
+		vgic_restore_state(vcpu);
+}
+
+void kvm_vgic_load(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!vgic_initialized(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_load(vcpu);
+	else
+		vgic_v3_load(vcpu);
+}
+
+void kvm_vgic_put(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!vgic_initialized(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_put(vcpu);
+	else
+		vgic_v3_put(vcpu);
+}
+
+void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		return;
+
+	if (kvm_vgic_global_state.type == VGIC_V2)
+		vgic_v2_vmcr_sync(vcpu);
+	else
+		vgic_v3_vmcr_sync(vcpu);
+}
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_irq *irq;
+	bool pending = false;
+	unsigned long flags;
+	struct vgic_vmcr vmcr;
+
+	if (!vcpu->kvm->arch.vgic.enabled)
+		return false;
+
+	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
+		return true;
+
+	vgic_get_vmcr(vcpu, &vmcr);
+
+	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+		raw_spin_lock(&irq->irq_lock);
+		pending = irq_is_pending(irq) && irq->enabled &&
+			  !irq->active &&
+			  irq->priority < vmcr.pmr;
+		raw_spin_unlock(&irq->irq_lock);
+
+		if (pending)
+			break;
+	}
+
+	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+	return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int c;
+
+	/*
+	 * We've injected an interrupt, time to find out who deserves
+	 * a good kick...
+	 */
+	kvm_for_each_vcpu(c, vcpu, kvm) {
+		if (kvm_vgic_vcpu_pending_irq(vcpu)) {
+			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+			kvm_vcpu_kick(vcpu);
+		}
+	}
+}
+
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+	struct vgic_irq *irq;
+	bool map_is_active;
+	unsigned long flags;
+
+	if (!vgic_initialized(vcpu->kvm))
+		return false;
+
+	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+	raw_spin_lock_irqsave(&irq->irq_lock, flags);
+	map_is_active = irq->hw && irq->active;
+	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+	vgic_put_irq(vcpu->kvm, irq);
+
+	return map_is_active;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
new file mode 100644
index 000000000000..769e4802645e
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_NEW_H__
+#define __KVM_ARM_VGIC_NEW_H__
+
+#include <linux/irqchip/arm-gic-common.h>
+
+#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
+#define IMPLEMENTER_ARM		0x43b
+
+#define VGIC_ADDR_UNDEF		(-1)
+#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
+
+#define INTERRUPT_ID_BITS_SPIS	10
+#define INTERRUPT_ID_BITS_ITS	16
+#define VGIC_PRI_BITS		5
+
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
+
+#define VGIC_AFFINITY_0_SHIFT 0
+#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
+#define VGIC_AFFINITY_1_SHIFT 8
+#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
+#define VGIC_AFFINITY_2_SHIFT 16
+#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
+#define VGIC_AFFINITY_3_SHIFT 24
+#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
+
+#define VGIC_AFFINITY_LEVEL(reg, level) \
+	((((reg) & VGIC_AFFINITY_## level ##_MASK) \
+	>> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/*
+ * The Userspace encodes the affinity differently from the MPIDR,
+ * Below macro converts vgic userspace format to MPIDR reg format.
+ */
+#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
+			    VGIC_AFFINITY_LEVEL(val, 1) | \
+			    VGIC_AFFINITY_LEVEL(val, 2) | \
+			    VGIC_AFFINITY_LEVEL(val, 3))
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt,
+ * below macros are defined for CPUREG encoding.
+ */
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0
+
+#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
+				      KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-its.txt,
+ * below macros are defined for ITS table entry encoding.
+ */
+#define KVM_ITS_CTE_VALID_SHIFT		63
+#define KVM_ITS_CTE_VALID_MASK		BIT_ULL(63)
+#define KVM_ITS_CTE_RDBASE_SHIFT	16
+#define KVM_ITS_CTE_ICID_MASK		GENMASK_ULL(15, 0)
+#define KVM_ITS_ITE_NEXT_SHIFT		48
+#define KVM_ITS_ITE_PINTID_SHIFT	16
+#define KVM_ITS_ITE_PINTID_MASK		GENMASK_ULL(47, 16)
+#define KVM_ITS_ITE_ICID_MASK		GENMASK_ULL(15, 0)
+#define KVM_ITS_DTE_VALID_SHIFT		63
+#define KVM_ITS_DTE_VALID_MASK		BIT_ULL(63)
+#define KVM_ITS_DTE_NEXT_SHIFT		49
+#define KVM_ITS_DTE_NEXT_MASK		GENMASK_ULL(62, 49)
+#define KVM_ITS_DTE_ITTADDR_SHIFT	5
+#define KVM_ITS_DTE_ITTADDR_MASK	GENMASK_ULL(48, 5)
+#define KVM_ITS_DTE_SIZE_MASK		GENMASK_ULL(4, 0)
+#define KVM_ITS_L1E_VALID_MASK		BIT_ULL(63)
+/* we only support 64 kB translation table page size */
+#define KVM_ITS_L1E_ADDR_MASK		GENMASK_ULL(51, 16)
+
+#define KVM_VGIC_V3_RDIST_INDEX_MASK	GENMASK_ULL(11, 0)
+#define KVM_VGIC_V3_RDIST_FLAGS_MASK	GENMASK_ULL(15, 12)
+#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT	12
+#define KVM_VGIC_V3_RDIST_BASE_MASK	GENMASK_ULL(51, 16)
+#define KVM_VGIC_V3_RDIST_COUNT_MASK	GENMASK_ULL(63, 52)
+#define KVM_VGIC_V3_RDIST_COUNT_SHIFT	52
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+/* Requires the irq_lock to be held by the caller. */
+static inline bool irq_is_pending(struct vgic_irq *irq)
+{
+	if (irq->config == VGIC_CONFIG_EDGE)
+		return irq->pending_latch;
+	else
+		return irq->pending_latch || irq->line_level;
+}
+
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+
+static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
+{
+	/* Account for the active state as an interrupt */
+	if (vgic_irq_is_sgi(irq->intid) && irq->source)
+		return hweight8(irq->source) + irq->active;
+
+	return irq_is_pending(irq) || irq->active;
+}
+
+static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
+{
+	return vgic_irq_get_lr_count(irq) > 1;
+}
+
+/*
+ * This struct provides an intermediate representation of the fields contained
+ * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
+ * state to userspace can generate either GICv2 or GICv3 CPU interface
+ * registers regardless of the hardware backed GIC used.
+ */
+struct vgic_vmcr {
+	u32	grpen0;
+	u32	grpen1;
+
+	u32	ackctl;
+	u32	fiqen;
+	u32	cbpr;
+	u32	eoim;
+
+	u32	abpr;
+	u32	bpr;
+	u32	pmr;  /* Priority mask field in the GICC_PMR and
+		       * ICC_PMR_EL1 priority field format */
+};
+
+struct vgic_reg_attr {
+	struct kvm_vcpu *vcpu;
+	gpa_t addr;
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr);
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+		       struct vgic_reg_attr *reg_attr);
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+		     gpa_t addr, int len);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+			      u32 intid);
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+			   unsigned long flags);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+		      phys_addr_t addr, phys_addr_t alignment);
+
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			  int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+			     enum vgic_type);
+
+void vgic_v2_init_lrs(void);
+void vgic_v2_load(struct kvm_vcpu *vcpu);
+void vgic_v2_put(struct kvm_vcpu *vcpu);
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+	if (irq->intid < VGIC_MIN_LPI)
+		return;
+
+	kref_get(&irq->refcount);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
+int vgic_v3_save_pending_tables(struct kvm *kvm);
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
+bool vgic_v3_check_base(struct kvm *kvm);
+
+void vgic_v3_load(struct kvm_vcpu *vcpu);
+void vgic_v3_put(struct kvm_vcpu *vcpu);
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
+
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 int offset, u32 *val);
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+			 u64 id, u64 *val);
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+				u64 *reg);
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+				    u32 intid, u64 *val);
+int kvm_register_vgic_device(unsigned long type);
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+void vgic_debug_init(struct kvm *kvm);
+void vgic_debug_destroy(struct kvm *kvm);
+
+bool lock_all_vcpus(struct kvm *kvm);
+void unlock_all_vcpus(struct kvm *kvm);
+
+static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
+
+	/*
+	 * num_pri_bits are initialized with HW supported values.
+	 * We can rely safely on num_pri_bits even if VM has not
+	 * restored ICC_CTLR_EL1 before restoring APnR registers.
+	 */
+	switch (cpu_if->num_pri_bits) {
+	case 7: return 3;
+	case 6: return 1;
+	default: return 0;
+	}
+}
+
+static inline bool
+vgic_v3_redist_region_full(struct vgic_redist_region *region)
+{
+	if (!region->count)
+		return false;
+
+	return (region->free_index >= region->count);
+}
+
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
+
+static inline size_t
+vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
+{
+	if (!rdreg->count)
+		return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
+	else
+		return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+							   u32 index);
+
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
+
+static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+	struct vgic_dist *d = &kvm->arch.vgic;
+
+	return (base + size > d->vgic_dist_base) &&
+		(base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
+}
+
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+			 u32 devid, u32 eventid, struct vgic_irq **irq);
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
+void vgic_lpi_translation_cache_init(struct kvm *kvm);
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
+void vgic_its_invalidate_cache(struct kvm *kvm);
+
+bool vgic_supports_direct_msis(struct kvm *kvm);
+int vgic_v4_init(struct kvm *kvm);
+void vgic_v4_teardown(struct kvm *kvm);
+void vgic_v4_configure_vsgis(struct kvm *kvm);
+
+#endif
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
deleted file mode 100644
index 0a356aa91aa1..000000000000
--- a/virt/kvm/arm/aarch32.c
+++ /dev/null
@@ -1,204 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/bits.h>
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-#define DFSR_FSC_EXTABT_LPAE	0x10
-#define DFSR_FSC_EXTABT_nLPAE	0x08
-#define DFSR_LPAE		BIT(9)
-
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
-	[0] = { 0, 0 },		/* Reset, unused */
-	[1] = { 4, 2 },		/* Undefined */
-	[2] = { 0, 0 },		/* SVC, unused */
-	[3] = { 4, 4 },		/* Prefetch abort */
-	[4] = { 8, 8 },		/* Data abort */
-	[5] = { 0, 0 },		/* HVC, unused */
-	[6] = { 4, 4 },		/* IRQ, unused */
-	[7] = { 4, 4 },		/* FIQ, unused */
-};
-
-/*
- * When an exception is taken, most CPSR fields are left unchanged in the
- * handler. However, some are explicitly overridden (e.g. M[4:0]).
- *
- * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
- * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
- * obsoleted by the ARMv7 virtualization extensions and is RES0.
- *
- * For the SPSR layout seen from AArch32, see:
- * - ARM DDI 0406C.d, page B1-1148
- * - ARM DDI 0487E.a, page G8-6264
- *
- * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
- * - ARM DDI 0487E.a, page C5-426
- *
- * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
- * MSB to LSB.
- */
-static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
-{
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-	unsigned long old, new;
-
-	old = *vcpu_cpsr(vcpu);
-	new = 0;
-
-	new |= (old & PSR_AA32_N_BIT);
-	new |= (old & PSR_AA32_Z_BIT);
-	new |= (old & PSR_AA32_C_BIT);
-	new |= (old & PSR_AA32_V_BIT);
-	new |= (old & PSR_AA32_Q_BIT);
-
-	// CPSR.IT[7:0] are set to zero upon any exception
-	// See ARM DDI 0487E.a, section G1.12.3
-	// See ARM DDI 0406C.d, section B1.8.3
-
-	new |= (old & PSR_AA32_DIT_BIT);
-
-	// CPSR.SSBS is set to SCTLR.DSSBS upon any exception
-	// See ARM DDI 0487E.a, page G8-6244
-	if (sctlr & BIT(31))
-		new |= PSR_AA32_SSBS_BIT;
-
-	// CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
-	// SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
-	// See ARM DDI 0487E.a, page G8-6246
-	new |= (old & PSR_AA32_PAN_BIT);
-	if (!(sctlr & BIT(23)))
-		new |= PSR_AA32_PAN_BIT;
-
-	// SS does not exist in AArch32, so ignore
-
-	// CPSR.IL is set to zero upon any exception
-	// See ARM DDI 0487E.a, page G1-5527
-
-	new |= (old & PSR_AA32_GE_MASK);
-
-	// CPSR.IT[7:0] are set to zero upon any exception
-	// See prior comment above
-
-	// CPSR.E is set to SCTLR.EE upon any exception
-	// See ARM DDI 0487E.a, page G8-6245
-	// See ARM DDI 0406C.d, page B4-1701
-	if (sctlr & BIT(25))
-		new |= PSR_AA32_E_BIT;
-
-	// CPSR.A is unchanged upon an exception to Undefined, Supervisor
-	// CPSR.A is set upon an exception to other modes
-	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-	// See ARM DDI 0406C.d, page B1-1182
-	new |= (old & PSR_AA32_A_BIT);
-	if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
-		new |= PSR_AA32_A_BIT;
-
-	// CPSR.I is set upon any exception
-	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-	// See ARM DDI 0406C.d, page B1-1182
-	new |= PSR_AA32_I_BIT;
-
-	// CPSR.F is set upon an exception to FIQ
-	// CPSR.F is unchanged upon an exception to other modes
-	// See ARM DDI 0487E.a, pages G1-5515 to G1-5516
-	// See ARM DDI 0406C.d, page B1-1182
-	new |= (old & PSR_AA32_F_BIT);
-	if (mode == PSR_AA32_MODE_FIQ)
-		new |= PSR_AA32_F_BIT;
-
-	// CPSR.T is set to SCTLR.TE upon any exception
-	// See ARM DDI 0487E.a, page G8-5514
-	// See ARM DDI 0406C.d, page B1-1181
-	if (sctlr & BIT(30))
-		new |= PSR_AA32_T_BIT;
-
-	new |= mode;
-
-	return new;
-}
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
-	unsigned long spsr = *vcpu_cpsr(vcpu);
-	bool is_thumb = (spsr & PSR_AA32_T_BIT);
-	u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
-	*vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
-
-	/* Note: These now point to the banked copies */
-	vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
-	*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-	/* Branch to exception vector */
-	if (sctlr & (1 << 13))
-		vect_offset += 0xffff0000;
-	else /* always have security exceptions */
-		vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
-	*vcpu_pc(vcpu) = vect_offset;
-}
-
-void kvm_inject_undef32(struct kvm_vcpu *vcpu)
-{
-	prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
-			 unsigned long addr)
-{
-	u32 vect_offset;
-	u32 *far, *fsr;
-	bool is_lpae;
-
-	if (is_pabt) {
-		vect_offset = 12;
-		far = &vcpu_cp15(vcpu, c6_IFAR);
-		fsr = &vcpu_cp15(vcpu, c5_IFSR);
-	} else { /* !iabt */
-		vect_offset = 16;
-		far = &vcpu_cp15(vcpu, c6_DFAR);
-		fsr = &vcpu_cp15(vcpu, c5_DFSR);
-	}
-
-	prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
-
-	*far = addr;
-
-	/* Give the guest an IMPLEMENTATION DEFINED exception */
-	is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-	if (is_lpae) {
-		*fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
-	} else {
-		/* no need to shuffle FS[4] into DFSR[10] as its 0 */
-		*fsr = DFSR_FSC_EXTABT_nLPAE;
-	}
-}
-
-void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt32(vcpu, false, addr);
-}
-
-void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt32(vcpu, true, addr);
-}
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
deleted file mode 100644
index 93bd59b46848..000000000000
--- a/virt/kvm/arm/arch_timer.c
+++ /dev/null
@@ -1,1180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/uaccess.h>
-
-#include <clocksource/arm_arch_timer.h>
-#include <asm/arch_timer.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-#include <kvm/arm_vgic.h>
-#include <kvm/arm_arch_timer.h>
-
-#include "trace.h"
-
-static struct timecounter *timecounter;
-static unsigned int host_vtimer_irq;
-static unsigned int host_ptimer_irq;
-static u32 host_vtimer_irq_flags;
-static u32 host_ptimer_irq_flags;
-
-static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
-
-static const struct kvm_irq_level default_ptimer_irq = {
-	.irq	= 30,
-	.level	= 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
-	.irq	= 27,
-	.level	= 1,
-};
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-				 struct arch_timer_context *timer_ctx);
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
-static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
-				struct arch_timer_context *timer,
-				enum kvm_arch_timer_regs treg,
-				u64 val);
-static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
-			      struct arch_timer_context *timer,
-			      enum kvm_arch_timer_regs treg);
-
-u64 kvm_phys_timer_read(void)
-{
-	return timecounter->cc->read(timecounter->cc);
-}
-
-static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
-{
-	if (has_vhe()) {
-		map->direct_vtimer = vcpu_vtimer(vcpu);
-		map->direct_ptimer = vcpu_ptimer(vcpu);
-		map->emul_ptimer = NULL;
-	} else {
-		map->direct_vtimer = vcpu_vtimer(vcpu);
-		map->direct_ptimer = NULL;
-		map->emul_ptimer = vcpu_ptimer(vcpu);
-	}
-
-	trace_kvm_get_timer_map(vcpu->vcpu_id, map);
-}
-
-static inline bool userspace_irqchip(struct kvm *kvm)
-{
-	return static_branch_unlikely(&userspace_irqchip_in_use) &&
-		unlikely(!irqchip_in_kernel(kvm));
-}
-
-static void soft_timer_start(struct hrtimer *hrt, u64 ns)
-{
-	hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
-		      HRTIMER_MODE_ABS_HARD);
-}
-
-static void soft_timer_cancel(struct hrtimer *hrt)
-{
-	hrtimer_cancel(hrt);
-}
-
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
-{
-	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
-	struct arch_timer_context *ctx;
-	struct timer_map map;
-
-	/*
-	 * We may see a timer interrupt after vcpu_put() has been called which
-	 * sets the CPU's vcpu pointer to NULL, because even though the timer
-	 * has been disabled in timer_save_state(), the hardware interrupt
-	 * signal may not have been retired from the interrupt controller yet.
-	 */
-	if (!vcpu)
-		return IRQ_HANDLED;
-
-	get_timer_map(vcpu, &map);
-
-	if (irq == host_vtimer_irq)
-		ctx = map.direct_vtimer;
-	else
-		ctx = map.direct_ptimer;
-
-	if (kvm_timer_should_fire(ctx))
-		kvm_timer_update_irq(vcpu, true, ctx);
-
-	if (userspace_irqchip(vcpu->kvm) &&
-	    !static_branch_unlikely(&has_gic_active_state))
-		disable_percpu_irq(host_vtimer_irq);
-
-	return IRQ_HANDLED;
-}
-
-static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
-{
-	u64 cval, now;
-
-	cval = timer_ctx->cnt_cval;
-	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-	if (now < cval) {
-		u64 ns;
-
-		ns = cyclecounter_cyc2ns(timecounter->cc,
-					 cval - now,
-					 timecounter->mask,
-					 &timecounter->frac);
-		return ns;
-	}
-
-	return 0;
-}
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
-{
-	WARN_ON(timer_ctx && timer_ctx->loaded);
-	return timer_ctx &&
-	       !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
-		(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
-}
-
-/*
- * Returns the earliest expiration time in ns among guest timers.
- * Note that it will return 0 if none of timers can fire.
- */
-static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
-{
-	u64 min_delta = ULLONG_MAX;
-	int i;
-
-	for (i = 0; i < NR_KVM_TIMERS; i++) {
-		struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
-
-		WARN(ctx->loaded, "timer %d loaded\n", i);
-		if (kvm_timer_irq_can_fire(ctx))
-			min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
-	}
-
-	/* If none of timers can fire, then return 0 */
-	if (min_delta == ULLONG_MAX)
-		return 0;
-
-	return min_delta;
-}
-
-static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
-{
-	struct arch_timer_cpu *timer;
-	struct kvm_vcpu *vcpu;
-	u64 ns;
-
-	timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
-	vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
-
-	/*
-	 * Check that the timer has really expired from the guest's
-	 * PoV (NTP on the host may have forced it to expire
-	 * early). If we should have slept longer, restart it.
-	 */
-	ns = kvm_timer_earliest_exp(vcpu);
-	if (unlikely(ns)) {
-		hrtimer_forward_now(hrt, ns_to_ktime(ns));
-		return HRTIMER_RESTART;
-	}
-
-	kvm_vcpu_wake_up(vcpu);
-	return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
-{
-	struct arch_timer_context *ctx;
-	struct kvm_vcpu *vcpu;
-	u64 ns;
-
-	ctx = container_of(hrt, struct arch_timer_context, hrtimer);
-	vcpu = ctx->vcpu;
-
-	trace_kvm_timer_hrtimer_expire(ctx);
-
-	/*
-	 * Check that the timer has really expired from the guest's
-	 * PoV (NTP on the host may have forced it to expire
-	 * early). If not ready, schedule for a later time.
-	 */
-	ns = kvm_timer_compute_delta(ctx);
-	if (unlikely(ns)) {
-		hrtimer_forward_now(hrt, ns_to_ktime(ns));
-		return HRTIMER_RESTART;
-	}
-
-	kvm_timer_update_irq(vcpu, true, ctx);
-	return HRTIMER_NORESTART;
-}
-
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
-{
-	enum kvm_arch_timers index;
-	u64 cval, now;
-
-	if (!timer_ctx)
-		return false;
-
-	index = arch_timer_ctx_index(timer_ctx);
-
-	if (timer_ctx->loaded) {
-		u32 cnt_ctl = 0;
-
-		switch (index) {
-		case TIMER_VTIMER:
-			cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
-			break;
-		case TIMER_PTIMER:
-			cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
-			break;
-		case NR_KVM_TIMERS:
-			/* GCC is braindead */
-			cnt_ctl = 0;
-			break;
-		}
-
-		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
-		        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
-		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
-	}
-
-	if (!kvm_timer_irq_can_fire(timer_ctx))
-		return false;
-
-	cval = timer_ctx->cnt_cval;
-	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-	return cval <= now;
-}
-
-bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
-{
-	struct timer_map map;
-
-	get_timer_map(vcpu, &map);
-
-	return kvm_timer_should_fire(map.direct_vtimer) ||
-	       kvm_timer_should_fire(map.direct_ptimer) ||
-	       kvm_timer_should_fire(map.emul_ptimer);
-}
-
-/*
- * Reflect the timer output level into the kvm_run structure
- */
-void kvm_timer_update_run(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-	/* Populate the device bitmap with the timer states */
-	regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
-				    KVM_ARM_DEV_EL1_PTIMER);
-	if (kvm_timer_should_fire(vtimer))
-		regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
-	if (kvm_timer_should_fire(ptimer))
-		regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
-}
-
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-				 struct arch_timer_context *timer_ctx)
-{
-	int ret;
-
-	timer_ctx->irq.level = new_level;
-	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
-				   timer_ctx->irq.level);
-
-	if (!userspace_irqchip(vcpu->kvm)) {
-		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					  timer_ctx->irq.irq,
-					  timer_ctx->irq.level,
-					  timer_ctx);
-		WARN_ON(ret);
-	}
-}
-
-/* Only called for a fully emulated timer */
-static void timer_emulate(struct arch_timer_context *ctx)
-{
-	bool should_fire = kvm_timer_should_fire(ctx);
-
-	trace_kvm_timer_emulate(ctx, should_fire);
-
-	if (should_fire != ctx->irq.level) {
-		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
-		return;
-	}
-
-	/*
-	 * If the timer can fire now, we don't need to have a soft timer
-	 * scheduled for the future.  If the timer cannot fire at all,
-	 * then we also don't need a soft timer.
-	 */
-	if (!kvm_timer_irq_can_fire(ctx)) {
-		soft_timer_cancel(&ctx->hrtimer);
-		return;
-	}
-
-	soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
-}
-
-static void timer_save_state(struct arch_timer_context *ctx)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
-	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
-	unsigned long flags;
-
-	if (!timer->enabled)
-		return;
-
-	local_irq_save(flags);
-
-	if (!ctx->loaded)
-		goto out;
-
-	switch (index) {
-	case TIMER_VTIMER:
-		ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
-		ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL);
-
-		/* Disable the timer */
-		write_sysreg_el0(0, SYS_CNTV_CTL);
-		isb();
-
-		break;
-	case TIMER_PTIMER:
-		ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
-		ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL);
-
-		/* Disable the timer */
-		write_sysreg_el0(0, SYS_CNTP_CTL);
-		isb();
-
-		break;
-	case NR_KVM_TIMERS:
-		BUG();
-	}
-
-	trace_kvm_timer_save_state(ctx);
-
-	ctx->loaded = false;
-out:
-	local_irq_restore(flags);
-}
-
-/*
- * Schedule the background timer before calling kvm_vcpu_block, so that this
- * thread is removed from its waitqueue and made runnable when there's a timer
- * interrupt to handle.
- */
-static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct timer_map map;
-
-	get_timer_map(vcpu, &map);
-
-	/*
-	 * If no timers are capable of raising interrupts (disabled or
-	 * masked), then there's no more work for us to do.
-	 */
-	if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
-	    !kvm_timer_irq_can_fire(map.direct_ptimer) &&
-	    !kvm_timer_irq_can_fire(map.emul_ptimer))
-		return;
-
-	/*
-	 * At least one guest time will expire. Schedule a background timer.
-	 * Set the earliest expiration time among the guest timers.
-	 */
-	soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
-}
-
-static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-	soft_timer_cancel(&timer->bg_timer);
-}
-
-static void timer_restore_state(struct arch_timer_context *ctx)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
-	enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
-	unsigned long flags;
-
-	if (!timer->enabled)
-		return;
-
-	local_irq_save(flags);
-
-	if (ctx->loaded)
-		goto out;
-
-	switch (index) {
-	case TIMER_VTIMER:
-		write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL);
-		isb();
-		write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL);
-		break;
-	case TIMER_PTIMER:
-		write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL);
-		isb();
-		write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL);
-		break;
-	case NR_KVM_TIMERS:
-		BUG();
-	}
-
-	trace_kvm_timer_restore_state(ctx);
-
-	ctx->loaded = true;
-out:
-	local_irq_restore(flags);
-}
-
-static void set_cntvoff(u64 cntvoff)
-{
-	u32 low = lower_32_bits(cntvoff);
-	u32 high = upper_32_bits(cntvoff);
-
-	/*
-	 * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
-	 * 32-bit systems, but rather passes register by register shifted one
-	 * place (we put the function address in r0/x0), we cannot simply pass
-	 * a 64-bit value as an argument, but have to split the value in two
-	 * 32-bit halves.
-	 */
-	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
-}
-
-static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
-{
-	int r;
-	r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
-	WARN_ON(r);
-}
-
-static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
-{
-	struct kvm_vcpu *vcpu = ctx->vcpu;
-	bool phys_active = false;
-
-	/*
-	 * Update the timer output so that it is likely to match the
-	 * state we're about to restore. If the timer expires between
-	 * this point and the register restoration, we'll take the
-	 * interrupt anyway.
-	 */
-	kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
-
-	if (irqchip_in_kernel(vcpu->kvm))
-		phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
-
-	phys_active |= ctx->irq.level;
-
-	set_timer_irq_phys_active(ctx, phys_active);
-}
-
-static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * Update the timer output so that it is likely to match the
-	 * state we're about to restore. If the timer expires between
-	 * this point and the register restoration, we'll take the
-	 * interrupt anyway.
-	 */
-	kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
-
-	/*
-	 * When using a userspace irqchip with the architected timers and a
-	 * host interrupt controller that doesn't support an active state, we
-	 * must still prevent continuously exiting from the guest, and
-	 * therefore mask the physical interrupt by disabling it on the host
-	 * interrupt controller when the virtual level is high, such that the
-	 * guest can make forward progress.  Once we detect the output level
-	 * being de-asserted, we unmask the interrupt again so that we exit
-	 * from the guest when the timer fires.
-	 */
-	if (vtimer->irq.level)
-		disable_percpu_irq(host_vtimer_irq);
-	else
-		enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-}
-
-void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct timer_map map;
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	get_timer_map(vcpu, &map);
-
-	if (static_branch_likely(&has_gic_active_state)) {
-		kvm_timer_vcpu_load_gic(map.direct_vtimer);
-		if (map.direct_ptimer)
-			kvm_timer_vcpu_load_gic(map.direct_ptimer);
-	} else {
-		kvm_timer_vcpu_load_nogic(vcpu);
-	}
-
-	set_cntvoff(map.direct_vtimer->cntvoff);
-
-	kvm_timer_unblocking(vcpu);
-
-	timer_restore_state(map.direct_vtimer);
-	if (map.direct_ptimer)
-		timer_restore_state(map.direct_ptimer);
-
-	if (map.emul_ptimer)
-		timer_emulate(map.emul_ptimer);
-}
-
-bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-	bool vlevel, plevel;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm)))
-		return false;
-
-	vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
-	plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
-
-	return kvm_timer_should_fire(vtimer) != vlevel ||
-	       kvm_timer_should_fire(ptimer) != plevel;
-}
-
-void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct timer_map map;
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	get_timer_map(vcpu, &map);
-
-	timer_save_state(map.direct_vtimer);
-	if (map.direct_ptimer)
-		timer_save_state(map.direct_ptimer);
-
-	/*
-	 * Cancel soft timer emulation, because the only case where we
-	 * need it after a vcpu_put is in the context of a sleeping VCPU, and
-	 * in that case we already factor in the deadline for the physical
-	 * timer when scheduling the bg_timer.
-	 *
-	 * In any case, we re-schedule the hrtimer for the physical timer when
-	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
-	 */
-	if (map.emul_ptimer)
-		soft_timer_cancel(&map.emul_ptimer->hrtimer);
-
-	if (swait_active(kvm_arch_vcpu_wq(vcpu)))
-		kvm_timer_blocking(vcpu);
-
-	/*
-	 * The kernel may decide to run userspace after calling vcpu_put, so
-	 * we reset cntvoff to 0 to ensure a consistent read between user
-	 * accesses to the virtual counter and kernel access to the physical
-	 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
-	 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
-	 */
-	set_cntvoff(0);
-}
-
-/*
- * With a userspace irqchip we have to check if the guest de-asserted the
- * timer and if so, unmask the timer irq signal on the host interrupt
- * controller to ensure that we see future timer signals.
- */
-static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	if (!kvm_timer_should_fire(vtimer)) {
-		kvm_timer_update_irq(vcpu, false, vtimer);
-		if (static_branch_likely(&has_gic_active_state))
-			set_timer_irq_phys_active(vtimer, false);
-		else
-			enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-	}
-}
-
-void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		unmask_vtimer_irq_user(vcpu);
-}
-
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct timer_map map;
-
-	get_timer_map(vcpu, &map);
-
-	/*
-	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
-	 * and to 0 for ARMv7.  We provide an implementation that always
-	 * resets the timer to be disabled and unmasked and is compliant with
-	 * the ARMv7 architecture.
-	 */
-	vcpu_vtimer(vcpu)->cnt_ctl = 0;
-	vcpu_ptimer(vcpu)->cnt_ctl = 0;
-
-	if (timer->enabled) {
-		kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
-		kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
-
-		if (irqchip_in_kernel(vcpu->kvm)) {
-			kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
-			if (map.direct_ptimer)
-				kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
-		}
-	}
-
-	if (map.emul_ptimer)
-		soft_timer_cancel(&map.emul_ptimer->hrtimer);
-
-	return 0;
-}
-
-/* Make the updates of cntvoff for all vtimer contexts atomic */
-static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
-{
-	int i;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *tmp;
-
-	mutex_lock(&kvm->lock);
-	kvm_for_each_vcpu(i, tmp, kvm)
-		vcpu_vtimer(tmp)->cntvoff = cntvoff;
-
-	/*
-	 * When called from the vcpu create path, the CPU being created is not
-	 * included in the loop above, so we just set it here as well.
-	 */
-	vcpu_vtimer(vcpu)->cntvoff = cntvoff;
-	mutex_unlock(&kvm->lock);
-}
-
-void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	/* Synchronize cntvoff across all vtimers of a VM. */
-	update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
-	ptimer->cntvoff = 0;
-
-	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	timer->bg_timer.function = kvm_bg_timer_expire;
-
-	hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	vtimer->hrtimer.function = kvm_hrtimer_expire;
-	ptimer->hrtimer.function = kvm_hrtimer_expire;
-
-	vtimer->irq.irq = default_vtimer_irq.irq;
-	ptimer->irq.irq = default_ptimer_irq.irq;
-
-	vtimer->host_timer_irq = host_vtimer_irq;
-	ptimer->host_timer_irq = host_ptimer_irq;
-
-	vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
-	ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
-
-	vtimer->vcpu = vcpu;
-	ptimer->vcpu = vcpu;
-}
-
-static void kvm_timer_init_interrupt(void *info)
-{
-	enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-	enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
-}
-
-int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
-{
-	struct arch_timer_context *timer;
-
-	switch (regid) {
-	case KVM_REG_ARM_TIMER_CTL:
-		timer = vcpu_vtimer(vcpu);
-		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
-		break;
-	case KVM_REG_ARM_TIMER_CNT:
-		timer = vcpu_vtimer(vcpu);
-		update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
-		break;
-	case KVM_REG_ARM_TIMER_CVAL:
-		timer = vcpu_vtimer(vcpu);
-		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
-		break;
-	case KVM_REG_ARM_PTIMER_CTL:
-		timer = vcpu_ptimer(vcpu);
-		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
-		break;
-	case KVM_REG_ARM_PTIMER_CVAL:
-		timer = vcpu_ptimer(vcpu);
-		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
-		break;
-
-	default:
-		return -1;
-	}
-
-	return 0;
-}
-
-static u64 read_timer_ctl(struct arch_timer_context *timer)
-{
-	/*
-	 * Set ISTATUS bit if it's expired.
-	 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
-	 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
-	 * regardless of ENABLE bit for our implementation convenience.
-	 */
-	if (!kvm_timer_compute_delta(timer))
-		return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
-	else
-		return timer->cnt_ctl;
-}
-
-u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{
-	switch (regid) {
-	case KVM_REG_ARM_TIMER_CTL:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_vtimer(vcpu), TIMER_REG_CTL);
-	case KVM_REG_ARM_TIMER_CNT:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_vtimer(vcpu), TIMER_REG_CNT);
-	case KVM_REG_ARM_TIMER_CVAL:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_vtimer(vcpu), TIMER_REG_CVAL);
-	case KVM_REG_ARM_PTIMER_CTL:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_ptimer(vcpu), TIMER_REG_CTL);
-	case KVM_REG_ARM_PTIMER_CNT:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_ptimer(vcpu), TIMER_REG_CNT);
-	case KVM_REG_ARM_PTIMER_CVAL:
-		return kvm_arm_timer_read(vcpu,
-					  vcpu_ptimer(vcpu), TIMER_REG_CVAL);
-	}
-	return (u64)-1;
-}
-
-static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
-			      struct arch_timer_context *timer,
-			      enum kvm_arch_timer_regs treg)
-{
-	u64 val;
-
-	switch (treg) {
-	case TIMER_REG_TVAL:
-		val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
-		val &= lower_32_bits(val);
-		break;
-
-	case TIMER_REG_CTL:
-		val = read_timer_ctl(timer);
-		break;
-
-	case TIMER_REG_CVAL:
-		val = timer->cnt_cval;
-		break;
-
-	case TIMER_REG_CNT:
-		val = kvm_phys_timer_read() - timer->cntvoff;
-		break;
-
-	default:
-		BUG();
-	}
-
-	return val;
-}
-
-u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
-			      enum kvm_arch_timers tmr,
-			      enum kvm_arch_timer_regs treg)
-{
-	u64 val;
-
-	preempt_disable();
-	kvm_timer_vcpu_put(vcpu);
-
-	val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
-
-	kvm_timer_vcpu_load(vcpu);
-	preempt_enable();
-
-	return val;
-}
-
-static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
-				struct arch_timer_context *timer,
-				enum kvm_arch_timer_regs treg,
-				u64 val)
-{
-	switch (treg) {
-	case TIMER_REG_TVAL:
-		timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
-		break;
-
-	case TIMER_REG_CTL:
-		timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
-		break;
-
-	case TIMER_REG_CVAL:
-		timer->cnt_cval = val;
-		break;
-
-	default:
-		BUG();
-	}
-}
-
-void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
-				enum kvm_arch_timers tmr,
-				enum kvm_arch_timer_regs treg,
-				u64 val)
-{
-	preempt_disable();
-	kvm_timer_vcpu_put(vcpu);
-
-	kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
-
-	kvm_timer_vcpu_load(vcpu);
-	preempt_enable();
-}
-
-static int kvm_timer_starting_cpu(unsigned int cpu)
-{
-	kvm_timer_init_interrupt(NULL);
-	return 0;
-}
-
-static int kvm_timer_dying_cpu(unsigned int cpu)
-{
-	disable_percpu_irq(host_vtimer_irq);
-	return 0;
-}
-
-int kvm_timer_hyp_init(bool has_gic)
-{
-	struct arch_timer_kvm_info *info;
-	int err;
-
-	info = arch_timer_get_kvm_info();
-	timecounter = &info->timecounter;
-
-	if (!timecounter->cc) {
-		kvm_err("kvm_arch_timer: uninitialized timecounter\n");
-		return -ENODEV;
-	}
-
-	/* First, do the virtual EL1 timer irq */
-
-	if (info->virtual_irq <= 0) {
-		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
-			info->virtual_irq);
-		return -ENODEV;
-	}
-	host_vtimer_irq = info->virtual_irq;
-
-	host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
-	if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
-	    host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
-		kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
-			host_vtimer_irq);
-		host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
-	}
-
-	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
-				 "kvm guest vtimer", kvm_get_running_vcpus());
-	if (err) {
-		kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
-			host_vtimer_irq, err);
-		return err;
-	}
-
-	if (has_gic) {
-		err = irq_set_vcpu_affinity(host_vtimer_irq,
-					    kvm_get_running_vcpus());
-		if (err) {
-			kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-			goto out_free_irq;
-		}
-
-		static_branch_enable(&has_gic_active_state);
-	}
-
-	kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
-
-	/* Now let's do the physical EL1 timer irq */
-
-	if (info->physical_irq > 0) {
-		host_ptimer_irq = info->physical_irq;
-		host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
-		if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
-		    host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
-			kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
-				host_ptimer_irq);
-			host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
-		}
-
-		err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
-					 "kvm guest ptimer", kvm_get_running_vcpus());
-		if (err) {
-			kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
-				host_ptimer_irq, err);
-			return err;
-		}
-
-		if (has_gic) {
-			err = irq_set_vcpu_affinity(host_ptimer_irq,
-						    kvm_get_running_vcpus());
-			if (err) {
-				kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-				goto out_free_irq;
-			}
-		}
-
-		kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
-	} else if (has_vhe()) {
-		kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
-			info->physical_irq);
-		err = -ENODEV;
-		goto out_free_irq;
-	}
-
-	cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
-			  "kvm/arm/timer:starting", kvm_timer_starting_cpu,
-			  kvm_timer_dying_cpu);
-	return 0;
-out_free_irq:
-	free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-	return err;
-}
-
-void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-
-	soft_timer_cancel(&timer->bg_timer);
-}
-
-static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
-{
-	int vtimer_irq, ptimer_irq;
-	int i, ret;
-
-	vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
-	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
-	if (ret)
-		return false;
-
-	ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
-	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
-	if (ret)
-		return false;
-
-	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
-		if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
-		    vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
-			return false;
-	}
-
-	return true;
-}
-
-bool kvm_arch_timer_get_input_level(int vintid)
-{
-	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
-	struct arch_timer_context *timer;
-
-	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
-		timer = vcpu_vtimer(vcpu);
-	else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
-		timer = vcpu_ptimer(vcpu);
-	else
-		BUG();
-
-	return kvm_timer_should_fire(timer);
-}
-
-int kvm_timer_enable(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = vcpu_timer(vcpu);
-	struct timer_map map;
-	int ret;
-
-	if (timer->enabled)
-		return 0;
-
-	/* Without a VGIC we do not map virtual IRQs to physical IRQs */
-	if (!irqchip_in_kernel(vcpu->kvm))
-		goto no_vgic;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -ENODEV;
-
-	if (!timer_irqs_are_valid(vcpu)) {
-		kvm_debug("incorrectly configured timer irqs\n");
-		return -EINVAL;
-	}
-
-	get_timer_map(vcpu, &map);
-
-	ret = kvm_vgic_map_phys_irq(vcpu,
-				    map.direct_vtimer->host_timer_irq,
-				    map.direct_vtimer->irq.irq,
-				    kvm_arch_timer_get_input_level);
-	if (ret)
-		return ret;
-
-	if (map.direct_ptimer) {
-		ret = kvm_vgic_map_phys_irq(vcpu,
-					    map.direct_ptimer->host_timer_irq,
-					    map.direct_ptimer->irq.irq,
-					    kvm_arch_timer_get_input_level);
-	}
-
-	if (ret)
-		return ret;
-
-no_vgic:
-	timer->enabled = 1;
-	return 0;
-}
-
-/*
- * On VHE system, we only need to configure the EL2 timer trap register once,
- * not for every world switch.
- * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
- * and this makes those bits have no effect for the host kernel execution.
- */
-void kvm_timer_init_vhe(void)
-{
-	/* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */
-	u32 cnthctl_shift = 10;
-	u64 val;
-
-	/*
-	 * VHE systems allow the guest direct access to the EL1 physical
-	 * timer/counter.
-	 */
-	val = read_sysreg(cnthctl_el2);
-	val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
-	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
-	write_sysreg(val, cnthctl_el2);
-}
-
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
-		vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
-	}
-}
-
-int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	int __user *uaddr = (int __user *)(long)attr->addr;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	int irq;
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return -EINVAL;
-
-	if (get_user(irq, uaddr))
-		return -EFAULT;
-
-	if (!(irq_is_ppi(irq)))
-		return -EINVAL;
-
-	if (vcpu->arch.timer_cpu.enabled)
-		return -EBUSY;
-
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
-		break;
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	int __user *uaddr = (int __user *)(long)attr->addr;
-	struct arch_timer_context *timer;
-	int irq;
-
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		timer = vcpu_vtimer(vcpu);
-		break;
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		timer = vcpu_ptimer(vcpu);
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	irq = timer->irq.irq;
-	return put_user(irq, uaddr);
-}
-
-int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		return 0;
-	}
-
-	return -ENXIO;
-}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
deleted file mode 100644
index 48d0ec44ad77..000000000000
--- a/virt/kvm/arm/arm.c
+++ /dev/null
@@ -1,1681 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/bug.h>
-#include <linux/cpu_pm.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/kvm_host.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/kvm.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/sched/stat.h>
-#include <trace/events/kvm.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-#include <linux/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/mman.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/cpufeature.h>
-#include <asm/virt.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
-#include <asm/sections.h>
-
-#include <kvm/arm_hypercalls.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_psci.h>
-
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension	virt");
-#endif
-
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
-static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-
-/* The VMID used in the VTTBR */
-static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u32 kvm_next_vmid;
-static DEFINE_SPINLOCK(kvm_vmid_lock);
-
-static bool vgic_present;
-
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
-DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
-
-int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
-{
-	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
-	return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
-	return 0;
-}
-
-int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
-			    struct kvm_enable_cap *cap)
-{
-	int r;
-
-	if (cap->flags)
-		return -EINVAL;
-
-	switch (cap->cap) {
-	case KVM_CAP_ARM_NISV_TO_USER:
-		r = 0;
-		kvm->arch.return_nisv_io_abort_to_user = true;
-		break;
-	default:
-		r = -EINVAL;
-		break;
-	}
-
-	return r;
-}
-
-/**
- * kvm_arch_init_vm - initializes a VM data structure
- * @kvm:	pointer to the KVM struct
- */
-int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-{
-	int ret, cpu;
-
-	ret = kvm_arm_setup_stage2(kvm, type);
-	if (ret)
-		return ret;
-
-	kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
-	if (!kvm->arch.last_vcpu_ran)
-		return -ENOMEM;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
-
-	ret = kvm_alloc_stage2_pgd(kvm);
-	if (ret)
-		goto out_fail_alloc;
-
-	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
-	if (ret)
-		goto out_free_stage2_pgd;
-
-	kvm_vgic_early_init(kvm);
-
-	/* Mark the initial VMID generation invalid */
-	kvm->arch.vmid.vmid_gen = 0;
-
-	/* The maximum number of VCPUs is limited by the host's GIC model */
-	kvm->arch.max_vcpus = vgic_present ?
-				kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
-
-	return ret;
-out_free_stage2_pgd:
-	kvm_free_stage2_pgd(kvm);
-out_fail_alloc:
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-	return ret;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
-{
-	return VM_FAULT_SIGBUS;
-}
-
-
-/**
- * kvm_arch_destroy_vm - destroy the VM data structure
- * @kvm:	pointer to the KVM struct
- */
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-	int i;
-
-	kvm_vgic_destroy(kvm);
-
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_vcpu_destroy(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
-	atomic_set(&kvm->online_vcpus, 0);
-}
-
-int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-{
-	int r;
-	switch (ext) {
-	case KVM_CAP_IRQCHIP:
-		r = vgic_present;
-		break;
-	case KVM_CAP_IOEVENTFD:
-	case KVM_CAP_DEVICE_CTRL:
-	case KVM_CAP_USER_MEMORY:
-	case KVM_CAP_SYNC_MMU:
-	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
-	case KVM_CAP_ONE_REG:
-	case KVM_CAP_ARM_PSCI:
-	case KVM_CAP_ARM_PSCI_0_2:
-	case KVM_CAP_READONLY_MEM:
-	case KVM_CAP_MP_STATE:
-	case KVM_CAP_IMMEDIATE_EXIT:
-	case KVM_CAP_VCPU_EVENTS:
-	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
-	case KVM_CAP_ARM_NISV_TO_USER:
-	case KVM_CAP_ARM_INJECT_EXT_DABT:
-		r = 1;
-		break;
-	case KVM_CAP_ARM_SET_DEVICE_ADDR:
-		r = 1;
-		break;
-	case KVM_CAP_NR_VCPUS:
-		r = num_online_cpus();
-		break;
-	case KVM_CAP_MAX_VCPUS:
-		r = KVM_MAX_VCPUS;
-		break;
-	case KVM_CAP_MAX_VCPU_ID:
-		r = KVM_MAX_VCPU_ID;
-		break;
-	case KVM_CAP_MSI_DEVID:
-		if (!kvm)
-			r = -EINVAL;
-		else
-			r = kvm->arch.vgic.msis_require_devid;
-		break;
-	case KVM_CAP_ARM_USER_IRQ:
-		/*
-		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
-		 * (bump this number if adding more devices)
-		 */
-		r = 1;
-		break;
-	default:
-		r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
-		break;
-	}
-	return r;
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
-			unsigned int ioctl, unsigned long arg)
-{
-	return -EINVAL;
-}
-
-struct kvm *kvm_arch_alloc_vm(void)
-{
-	if (!has_vhe())
-		return kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-	return vzalloc(sizeof(struct kvm));
-}
-
-void kvm_arch_free_vm(struct kvm *kvm)
-{
-	if (!has_vhe())
-		kfree(kvm);
-	else
-		vfree(kvm);
-}
-
-int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
-{
-	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
-		return -EBUSY;
-
-	if (id >= kvm->arch.max_vcpus)
-		return -EINVAL;
-
-	return 0;
-}
-
-int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
-{
-	int err;
-
-	/* Force users to call KVM_ARM_VCPU_INIT */
-	vcpu->arch.target = -1;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* Set up the timer */
-	kvm_timer_vcpu_init(vcpu);
-
-	kvm_pmu_vcpu_init(vcpu);
-
-	kvm_arm_reset_debug_ptr(vcpu);
-
-	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
-
-	err = kvm_vgic_vcpu_init(vcpu);
-	if (err)
-		return err;
-
-	return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
-}
-
-void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		static_branch_dec(&userspace_irqchip_in_use);
-
-	kvm_mmu_free_memory_caches(vcpu);
-	kvm_timer_vcpu_terminate(vcpu);
-	kvm_pmu_vcpu_destroy(vcpu);
-
-	kvm_arm_vcpu_destroy(vcpu);
-}
-
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	return kvm_timer_is_pending(vcpu);
-}
-
-void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * If we're about to block (most likely because we've just hit a
-	 * WFI), we need to sync back the state of the GIC CPU interface
-	 * so that we have the latest PMR and group enables. This ensures
-	 * that kvm_arch_vcpu_runnable has up-to-date data to decide
-	 * whether we have pending interrupts.
-	 *
-	 * For the same reason, we want to tell GICv4 that we need
-	 * doorbells to be signalled, should an interrupt become pending.
-	 */
-	preempt_disable();
-	kvm_vgic_vmcr_sync(vcpu);
-	vgic_v4_put(vcpu, true);
-	preempt_enable();
-}
-
-void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-	preempt_disable();
-	vgic_v4_load(vcpu);
-	preempt_enable();
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	int *last_ran;
-	kvm_host_data_t *cpu_data;
-
-	last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
-	cpu_data = this_cpu_ptr(&kvm_host_data);
-
-	/*
-	 * We might get preempted before the vCPU actually runs, but
-	 * over-invalidation doesn't affect correctness.
-	 */
-	if (*last_ran != vcpu->vcpu_id) {
-		kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
-		*last_ran = vcpu->vcpu_id;
-	}
-
-	vcpu->cpu = cpu;
-	vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
-
-	kvm_vgic_load(vcpu);
-	kvm_timer_vcpu_load(vcpu);
-	kvm_vcpu_load_sysregs(vcpu);
-	kvm_arch_vcpu_load_fp(vcpu);
-	kvm_vcpu_pmu_restore_guest(vcpu);
-	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
-		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
-
-	if (single_task_running())
-		vcpu_clear_wfx_traps(vcpu);
-	else
-		vcpu_set_wfx_traps(vcpu);
-
-	vcpu_ptrauth_setup_lazy(vcpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	kvm_arch_vcpu_put_fp(vcpu);
-	kvm_vcpu_put_sysregs(vcpu);
-	kvm_timer_vcpu_put(vcpu);
-	kvm_vgic_put(vcpu);
-	kvm_vcpu_pmu_restore_host(vcpu);
-
-	vcpu->cpu = -1;
-}
-
-static void vcpu_power_off(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.power_off = true;
-	kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	if (vcpu->arch.power_off)
-		mp_state->mp_state = KVM_MP_STATE_STOPPED;
-	else
-		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
-
-	return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	int ret = 0;
-
-	switch (mp_state->mp_state) {
-	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.power_off = false;
-		break;
-	case KVM_MP_STATE_STOPPED:
-		vcpu_power_off(vcpu);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
-/**
- * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
- * @v:		The VCPU pointer
- *
- * If the guest CPU is not waiting for interrupts or an interrupt line is
- * asserted, the CPU is by definition runnable.
- */
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
-{
-	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
-	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-		&& !v->arch.power_off && !v->arch.pause);
-}
-
-bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
-{
-	return vcpu_mode_priv(vcpu);
-}
-
-/* Just ensure a guest exit from a particular CPU */
-static void exit_vm_noop(void *info)
-{
-}
-
-void force_vm_exit(const cpumask_t *mask)
-{
-	preempt_disable();
-	smp_call_function_many(mask, exit_vm_noop, NULL, true);
-	preempt_enable();
-}
-
-/**
- * need_new_vmid_gen - check that the VMID is still valid
- * @vmid: The VMID to check
- *
- * return true if there is a new generation of VMIDs being used
- *
- * The hardware supports a limited set of values with the value zero reserved
- * for the host, so we check if an assigned value belongs to a previous
- * generation, which which requires us to assign a new value. If we're the
- * first to use a VMID for the new generation, we must flush necessary caches
- * and TLBs on all CPUs.
- */
-static bool need_new_vmid_gen(struct kvm_vmid *vmid)
-{
-	u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
-	smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
-	return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
-}
-
-/**
- * update_vmid - Update the vmid with a valid VMID for the current generation
- * @kvm: The guest that struct vmid belongs to
- * @vmid: The stage-2 VMID information struct
- */
-static void update_vmid(struct kvm_vmid *vmid)
-{
-	if (!need_new_vmid_gen(vmid))
-		return;
-
-	spin_lock(&kvm_vmid_lock);
-
-	/*
-	 * We need to re-check the vmid_gen here to ensure that if another vcpu
-	 * already allocated a valid vmid for this vm, then this vcpu should
-	 * use the same vmid.
-	 */
-	if (!need_new_vmid_gen(vmid)) {
-		spin_unlock(&kvm_vmid_lock);
-		return;
-	}
-
-	/* First user of a new VMID generation? */
-	if (unlikely(kvm_next_vmid == 0)) {
-		atomic64_inc(&kvm_vmid_gen);
-		kvm_next_vmid = 1;
-
-		/*
-		 * On SMP we know no other CPUs can use this CPU's or each
-		 * other's VMID after force_vm_exit returns since the
-		 * kvm_vmid_lock blocks them from reentry to the guest.
-		 */
-		force_vm_exit(cpu_all_mask);
-		/*
-		 * Now broadcast TLB + ICACHE invalidation over the inner
-		 * shareable domain to make sure all data structures are
-		 * clean.
-		 */
-		kvm_call_hyp(__kvm_flush_vm_context);
-	}
-
-	vmid->vmid = kvm_next_vmid;
-	kvm_next_vmid++;
-	kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
-
-	smp_wmb();
-	WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
-
-	spin_unlock(&kvm_vmid_lock);
-}
-
-static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	int ret = 0;
-
-	if (likely(vcpu->arch.has_run_once))
-		return 0;
-
-	if (!kvm_arm_vcpu_is_finalized(vcpu))
-		return -EPERM;
-
-	vcpu->arch.has_run_once = true;
-
-	if (likely(irqchip_in_kernel(kvm))) {
-		/*
-		 * Map the VGIC hardware resources before running a vcpu the
-		 * first time on this VM.
-		 */
-		if (unlikely(!vgic_ready(kvm))) {
-			ret = kvm_vgic_map_resources(kvm);
-			if (ret)
-				return ret;
-		}
-	} else {
-		/*
-		 * Tell the rest of the code that there are userspace irqchip
-		 * VMs in the wild.
-		 */
-		static_branch_inc(&userspace_irqchip_in_use);
-	}
-
-	ret = kvm_timer_enable(vcpu);
-	if (ret)
-		return ret;
-
-	ret = kvm_arm_pmu_v3_enable(vcpu);
-
-	return ret;
-}
-
-bool kvm_arch_intc_initialized(struct kvm *kvm)
-{
-	return vgic_initialized(kvm);
-}
-
-void kvm_arm_halt_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.pause = true;
-	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
-}
-
-void kvm_arm_resume_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		vcpu->arch.pause = false;
-		swake_up_one(kvm_arch_vcpu_wq(vcpu));
-	}
-}
-
-static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
-{
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-	swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
-				       (!vcpu->arch.pause)));
-
-	if (vcpu->arch.power_off || vcpu->arch.pause) {
-		/* Awaken to handle a signal, request we sleep again later. */
-		kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	}
-
-	/*
-	 * Make sure we will observe a potential reset request if we've
-	 * observed a change to the power state. Pairs with the smp_wmb() in
-	 * kvm_psci_vcpu_on().
-	 */
-	smp_rmb();
-}
-
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.target >= 0;
-}
-
-static void check_vcpu_requests(struct kvm_vcpu *vcpu)
-{
-	if (kvm_request_pending(vcpu)) {
-		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-			vcpu_req_sleep(vcpu);
-
-		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
-			kvm_reset_vcpu(vcpu);
-
-		/*
-		 * Clear IRQ_PENDING requests that were made to guarantee
-		 * that a VCPU sees new virtual interrupts.
-		 */
-		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
-
-		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
-			kvm_update_stolen_time(vcpu);
-
-		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
-			/* The distributor enable bits were changed */
-			preempt_disable();
-			vgic_v4_put(vcpu, false);
-			vgic_v4_load(vcpu);
-			preempt_enable();
-		}
-	}
-}
-
-/**
- * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
- * @vcpu:	The VCPU pointer
- * @run:	The kvm_run structure pointer used for userspace state exchange
- *
- * This function is called through the VCPU_RUN ioctl called from user space. It
- * will execute VM code in a loop until the time slice for the process is used
- * or some emulation is needed from user space in which case the function will
- * return with return value 0 and with the kvm_run structure filled in with the
- * required data for the requested emulation.
- */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	int ret;
-
-	if (unlikely(!kvm_vcpu_initialized(vcpu)))
-		return -ENOEXEC;
-
-	ret = kvm_vcpu_first_run_init(vcpu);
-	if (ret)
-		return ret;
-
-	if (run->exit_reason == KVM_EXIT_MMIO) {
-		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
-		if (ret)
-			return ret;
-	}
-
-	if (run->immediate_exit)
-		return -EINTR;
-
-	vcpu_load(vcpu);
-
-	kvm_sigset_activate(vcpu);
-
-	ret = 1;
-	run->exit_reason = KVM_EXIT_UNKNOWN;
-	while (ret > 0) {
-		/*
-		 * Check conditions before entering the guest
-		 */
-		cond_resched();
-
-		update_vmid(&vcpu->kvm->arch.vmid);
-
-		check_vcpu_requests(vcpu);
-
-		/*
-		 * Preparing the interrupts to be injected also
-		 * involves poking the GIC, which must be done in a
-		 * non-preemptible context.
-		 */
-		preempt_disable();
-
-		kvm_pmu_flush_hwstate(vcpu);
-
-		local_irq_disable();
-
-		kvm_vgic_flush_hwstate(vcpu);
-
-		/*
-		 * Exit if we have a signal pending so that we can deliver the
-		 * signal to user space.
-		 */
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			run->exit_reason = KVM_EXIT_INTR;
-		}
-
-		/*
-		 * If we're using a userspace irqchip, then check if we need
-		 * to tell a userspace irqchip about timer or PMU level
-		 * changes and if so, exit to userspace (the actual level
-		 * state gets updated in kvm_timer_update_run and
-		 * kvm_pmu_update_run below).
-		 */
-		if (static_branch_unlikely(&userspace_irqchip_in_use)) {
-			if (kvm_timer_should_notify_user(vcpu) ||
-			    kvm_pmu_should_notify_user(vcpu)) {
-				ret = -EINTR;
-				run->exit_reason = KVM_EXIT_INTR;
-			}
-		}
-
-		/*
-		 * Ensure we set mode to IN_GUEST_MODE after we disable
-		 * interrupts and before the final VCPU requests check.
-		 * See the comment in kvm_vcpu_exiting_guest_mode() and
-		 * Documentation/virt/kvm/vcpu-requests.rst
-		 */
-		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
-
-		if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
-		    kvm_request_pending(vcpu)) {
-			vcpu->mode = OUTSIDE_GUEST_MODE;
-			isb(); /* Ensure work in x_flush_hwstate is committed */
-			kvm_pmu_sync_hwstate(vcpu);
-			if (static_branch_unlikely(&userspace_irqchip_in_use))
-				kvm_timer_sync_hwstate(vcpu);
-			kvm_vgic_sync_hwstate(vcpu);
-			local_irq_enable();
-			preempt_enable();
-			continue;
-		}
-
-		kvm_arm_setup_debug(vcpu);
-
-		/**************************************************************
-		 * Enter the guest
-		 */
-		trace_kvm_entry(*vcpu_pc(vcpu));
-		guest_enter_irqoff();
-
-		if (has_vhe()) {
-			ret = kvm_vcpu_run_vhe(vcpu);
-		} else {
-			ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
-		}
-
-		vcpu->mode = OUTSIDE_GUEST_MODE;
-		vcpu->stat.exits++;
-		/*
-		 * Back from guest
-		 *************************************************************/
-
-		kvm_arm_clear_debug(vcpu);
-
-		/*
-		 * We must sync the PMU state before the vgic state so
-		 * that the vgic can properly sample the updated state of the
-		 * interrupt line.
-		 */
-		kvm_pmu_sync_hwstate(vcpu);
-
-		/*
-		 * Sync the vgic state before syncing the timer state because
-		 * the timer code needs to know if the virtual timer
-		 * interrupts are active.
-		 */
-		kvm_vgic_sync_hwstate(vcpu);
-
-		/*
-		 * Sync the timer hardware state before enabling interrupts as
-		 * we don't want vtimer interrupts to race with syncing the
-		 * timer virtual interrupt state.
-		 */
-		if (static_branch_unlikely(&userspace_irqchip_in_use))
-			kvm_timer_sync_hwstate(vcpu);
-
-		kvm_arch_vcpu_ctxsync_fp(vcpu);
-
-		/*
-		 * We may have taken a host interrupt in HYP mode (ie
-		 * while executing the guest). This interrupt is still
-		 * pending, as we haven't serviced it yet!
-		 *
-		 * We're now back in SVC mode, with interrupts
-		 * disabled.  Enabling the interrupts now will have
-		 * the effect of taking the interrupt again, in SVC
-		 * mode this time.
-		 */
-		local_irq_enable();
-
-		/*
-		 * We do local_irq_enable() before calling guest_exit() so
-		 * that if a timer interrupt hits while running the guest we
-		 * account that tick as being spent in the guest.  We enable
-		 * preemption after calling guest_exit() so that if we get
-		 * preempted we make sure ticks after that is not counted as
-		 * guest time.
-		 */
-		guest_exit();
-		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-
-		/* Exit types that need handling before we can be preempted */
-		handle_exit_early(vcpu, run, ret);
-
-		preempt_enable();
-
-		ret = handle_exit(vcpu, run, ret);
-	}
-
-	/* Tell userspace about in-kernel device output levels */
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		kvm_timer_update_run(vcpu);
-		kvm_pmu_update_run(vcpu);
-	}
-
-	kvm_sigset_deactivate(vcpu);
-
-	vcpu_put(vcpu);
-	return ret;
-}
-
-static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
-{
-	int bit_index;
-	bool set;
-	unsigned long *hcr;
-
-	if (number == KVM_ARM_IRQ_CPU_IRQ)
-		bit_index = __ffs(HCR_VI);
-	else /* KVM_ARM_IRQ_CPU_FIQ */
-		bit_index = __ffs(HCR_VF);
-
-	hcr = vcpu_hcr(vcpu);
-	if (level)
-		set = test_and_set_bit(bit_index, hcr);
-	else
-		set = test_and_clear_bit(bit_index, hcr);
-
-	/*
-	 * If we didn't change anything, no need to wake up or kick other CPUs
-	 */
-	if (set == level)
-		return 0;
-
-	/*
-	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
-	 * trigger a world-switch round on the running physical CPU to set the
-	 * virtual IRQ/FIQ fields in the HCR appropriately.
-	 */
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return 0;
-}
-
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
-			  bool line_status)
-{
-	u32 irq = irq_level->irq;
-	unsigned int irq_type, vcpu_idx, irq_num;
-	int nrcpus = atomic_read(&kvm->online_vcpus);
-	struct kvm_vcpu *vcpu = NULL;
-	bool level = irq_level->level;
-
-	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
-	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
-	vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
-	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
-
-	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
-
-	switch (irq_type) {
-	case KVM_ARM_IRQ_TYPE_CPU:
-		if (irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
-			return -EINVAL;
-
-		return vcpu_interrupt_line(vcpu, irq_num, level);
-	case KVM_ARM_IRQ_TYPE_PPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
-	case KVM_ARM_IRQ_TYPE_SPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (irq_num < VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
-	}
-
-	return -EINVAL;
-}
-
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			       const struct kvm_vcpu_init *init)
-{
-	unsigned int i, ret;
-	int phys_target = kvm_target_cpu();
-
-	if (init->target != phys_target)
-		return -EINVAL;
-
-	/*
-	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-	 * use the same target.
-	 */
-	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
-		return -EINVAL;
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		bool set = (init->features[i / 32] & (1 << (i % 32)));
-
-		if (set && i >= KVM_VCPU_MAX_FEATURES)
-			return -ENOENT;
-
-		/*
-		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-		 * use the same feature set.
-		 */
-		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
-		    test_bit(i, vcpu->arch.features) != set)
-			return -EINVAL;
-
-		if (set)
-			set_bit(i, vcpu->arch.features);
-	}
-
-	vcpu->arch.target = phys_target;
-
-	/* Now we know what it is, we can reset it. */
-	ret = kvm_reset_vcpu(vcpu);
-	if (ret) {
-		vcpu->arch.target = -1;
-		bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-	}
-
-	return ret;
-}
-
-static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
-					 struct kvm_vcpu_init *init)
-{
-	int ret;
-
-	ret = kvm_vcpu_set_target(vcpu, init);
-	if (ret)
-		return ret;
-
-	/*
-	 * Ensure a rebooted VM will fault in RAM pages and detect if the
-	 * guest MMU is turned off and flush the caches as needed.
-	 */
-	if (vcpu->arch.has_run_once)
-		stage2_unmap_vm(vcpu->kvm);
-
-	vcpu_reset_hcr(vcpu);
-
-	/*
-	 * Handle the "start in power-off" case.
-	 */
-	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu_power_off(vcpu);
-	else
-		vcpu->arch.power_off = false;
-
-	return 0;
-}
-
-static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
-				   struct kvm_vcpu_events *events)
-{
-	memset(events, 0, sizeof(*events));
-
-	return __kvm_arm_vcpu_get_events(vcpu, events);
-}
-
-static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
-				   struct kvm_vcpu_events *events)
-{
-	int i;
-
-	/* check whether the reserved field is zero */
-	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
-		if (events->reserved[i])
-			return -EINVAL;
-
-	/* check whether the pad field is zero */
-	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
-		if (events->exception.pad[i])
-			return -EINVAL;
-
-	return __kvm_arm_vcpu_set_events(vcpu, events);
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
-			 unsigned int ioctl, unsigned long arg)
-{
-	struct kvm_vcpu *vcpu = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	struct kvm_device_attr attr;
-	long r;
-
-	switch (ioctl) {
-	case KVM_ARM_VCPU_INIT: {
-		struct kvm_vcpu_init init;
-
-		r = -EFAULT;
-		if (copy_from_user(&init, argp, sizeof(init)))
-			break;
-
-		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
-		break;
-	}
-	case KVM_SET_ONE_REG:
-	case KVM_GET_ONE_REG: {
-		struct kvm_one_reg reg;
-
-		r = -ENOEXEC;
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(&reg, argp, sizeof(reg)))
-			break;
-
-		if (ioctl == KVM_SET_ONE_REG)
-			r = kvm_arm_set_reg(vcpu, &reg);
-		else
-			r = kvm_arm_get_reg(vcpu, &reg);
-		break;
-	}
-	case KVM_GET_REG_LIST: {
-		struct kvm_reg_list __user *user_list = argp;
-		struct kvm_reg_list reg_list;
-		unsigned n;
-
-		r = -ENOEXEC;
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			break;
-
-		r = -EPERM;
-		if (!kvm_arm_vcpu_is_finalized(vcpu))
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-			break;
-		n = reg_list.n;
-		reg_list.n = kvm_arm_num_regs(vcpu);
-		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-			break;
-		r = -E2BIG;
-		if (n < reg_list.n)
-			break;
-		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
-		break;
-	}
-	case KVM_SET_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_GET_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_HAS_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_GET_VCPU_EVENTS: {
-		struct kvm_vcpu_events events;
-
-		if (kvm_arm_vcpu_get_events(vcpu, &events))
-			return -EINVAL;
-
-		if (copy_to_user(argp, &events, sizeof(events)))
-			return -EFAULT;
-
-		return 0;
-	}
-	case KVM_SET_VCPU_EVENTS: {
-		struct kvm_vcpu_events events;
-
-		if (copy_from_user(&events, argp, sizeof(events)))
-			return -EFAULT;
-
-		return kvm_arm_vcpu_set_events(vcpu, &events);
-	}
-	case KVM_ARM_VCPU_FINALIZE: {
-		int what;
-
-		if (!kvm_vcpu_initialized(vcpu))
-			return -ENOEXEC;
-
-		if (get_user(what, (const int __user *)argp))
-			return -EFAULT;
-
-		return kvm_arm_vcpu_finalize(vcpu, what);
-	}
-	default:
-		r = -EINVAL;
-	}
-
-	return r;
-}
-
-void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
-{
-
-}
-
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					struct kvm_memory_slot *memslot)
-{
-	kvm_flush_remote_tlbs(kvm);
-}
-
-static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
-					struct kvm_arm_device_addr *dev_addr)
-{
-	unsigned long dev_id, type;
-
-	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
-		KVM_ARM_DEVICE_ID_SHIFT;
-	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
-		KVM_ARM_DEVICE_TYPE_SHIFT;
-
-	switch (dev_id) {
-	case KVM_ARM_DEVICE_VGIC_V2:
-		if (!vgic_present)
-			return -ENXIO;
-		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
-	default:
-		return -ENODEV;
-	}
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
-{
-	struct kvm *kvm = filp->private_data;
-	void __user *argp = (void __user *)arg;
-
-	switch (ioctl) {
-	case KVM_CREATE_IRQCHIP: {
-		int ret;
-		if (!vgic_present)
-			return -ENXIO;
-		mutex_lock(&kvm->lock);
-		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		mutex_unlock(&kvm->lock);
-		return ret;
-	}
-	case KVM_ARM_SET_DEVICE_ADDR: {
-		struct kvm_arm_device_addr dev_addr;
-
-		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
-			return -EFAULT;
-		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
-	}
-	case KVM_ARM_PREFERRED_TARGET: {
-		int err;
-		struct kvm_vcpu_init init;
-
-		err = kvm_vcpu_preferred_target(&init);
-		if (err)
-			return err;
-
-		if (copy_to_user(argp, &init, sizeof(init)))
-			return -EFAULT;
-
-		return 0;
-	}
-	default:
-		return -EINVAL;
-	}
-}
-
-static void cpu_init_hyp_mode(void)
-{
-	phys_addr_t pgd_ptr;
-	unsigned long hyp_stack_ptr;
-	unsigned long stack_page;
-	unsigned long vector_ptr;
-
-	/* Switch from the HYP stub to our own HYP init vector */
-	__hyp_set_vectors(kvm_get_idmap_vector());
-
-	pgd_ptr = kvm_mmu_get_httbr();
-	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
-	hyp_stack_ptr = stack_page + PAGE_SIZE;
-	vector_ptr = (unsigned long)kvm_get_hyp_vector();
-
-	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
-	__cpu_init_stage2();
-}
-
-static void cpu_hyp_reset(void)
-{
-	if (!is_kernel_in_hyp_mode())
-		__hyp_reset_vectors();
-}
-
-static void cpu_hyp_reinit(void)
-{
-	kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
-
-	cpu_hyp_reset();
-
-	if (is_kernel_in_hyp_mode())
-		kvm_timer_init_vhe();
-	else
-		cpu_init_hyp_mode();
-
-	kvm_arm_init_debug();
-
-	if (vgic_present)
-		kvm_vgic_init_cpu_hardware();
-}
-
-static void _kvm_arch_hardware_enable(void *discard)
-{
-	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reinit();
-		__this_cpu_write(kvm_arm_hardware_enabled, 1);
-	}
-}
-
-int kvm_arch_hardware_enable(void)
-{
-	_kvm_arch_hardware_enable(NULL);
-	return 0;
-}
-
-static void _kvm_arch_hardware_disable(void *discard)
-{
-	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reset();
-		__this_cpu_write(kvm_arm_hardware_enabled, 0);
-	}
-}
-
-void kvm_arch_hardware_disable(void)
-{
-	_kvm_arch_hardware_disable(NULL);
-}
-
-#ifdef CONFIG_CPU_PM
-static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
-				    unsigned long cmd,
-				    void *v)
-{
-	/*
-	 * kvm_arm_hardware_enabled is left with its old value over
-	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
-	 * re-enable hyp.
-	 */
-	switch (cmd) {
-	case CPU_PM_ENTER:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/*
-			 * don't update kvm_arm_hardware_enabled here
-			 * so that the hardware will be re-enabled
-			 * when we resume. See below.
-			 */
-			cpu_hyp_reset();
-
-		return NOTIFY_OK;
-	case CPU_PM_ENTER_FAILED:
-	case CPU_PM_EXIT:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/* The hardware was enabled before suspend. */
-			cpu_hyp_reinit();
-
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block hyp_init_cpu_pm_nb = {
-	.notifier_call = hyp_init_cpu_pm_notifier,
-};
-
-static void __init hyp_cpu_pm_init(void)
-{
-	cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
-}
-static void __init hyp_cpu_pm_exit(void)
-{
-	cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
-}
-#else
-static inline void hyp_cpu_pm_init(void)
-{
-}
-static inline void hyp_cpu_pm_exit(void)
-{
-}
-#endif
-
-static int init_common_resources(void)
-{
-	kvm_set_ipa_limit();
-
-	return 0;
-}
-
-static int init_subsystems(void)
-{
-	int err = 0;
-
-	/*
-	 * Enable hardware so that subsystem initialisation can access EL2.
-	 */
-	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
-
-	/*
-	 * Register CPU lower-power notifier
-	 */
-	hyp_cpu_pm_init();
-
-	/*
-	 * Init HYP view of VGIC
-	 */
-	err = kvm_vgic_hyp_init();
-	switch (err) {
-	case 0:
-		vgic_present = true;
-		break;
-	case -ENODEV:
-	case -ENXIO:
-		vgic_present = false;
-		err = 0;
-		break;
-	default:
-		goto out;
-	}
-
-	/*
-	 * Init HYP architected timer support
-	 */
-	err = kvm_timer_hyp_init(vgic_present);
-	if (err)
-		goto out;
-
-	kvm_perf_init();
-	kvm_coproc_table_init();
-
-out:
-	on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
-
-	return err;
-}
-
-static void teardown_hyp_mode(void)
-{
-	int cpu;
-
-	free_hyp_pgds();
-	for_each_possible_cpu(cpu)
-		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
-}
-
-/**
- * Inits Hyp-mode on all online CPUs
- */
-static int init_hyp_mode(void)
-{
-	int cpu;
-	int err = 0;
-
-	/*
-	 * Allocate Hyp PGD and setup Hyp identity mapping
-	 */
-	err = kvm_mmu_init();
-	if (err)
-		goto out_err;
-
-	/*
-	 * Allocate stack pages for Hypervisor-mode
-	 */
-	for_each_possible_cpu(cpu) {
-		unsigned long stack_page;
-
-		stack_page = __get_free_page(GFP_KERNEL);
-		if (!stack_page) {
-			err = -ENOMEM;
-			goto out_err;
-		}
-
-		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
-	}
-
-	/*
-	 * Map the Hyp-code called directly from the host
-	 */
-	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
-	if (err) {
-		kvm_err("Cannot map world-switch code\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map rodata section\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
-				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map bss section\n");
-		goto out_err;
-	}
-
-	err = kvm_map_vectors();
-	if (err) {
-		kvm_err("Cannot map vectors\n");
-		goto out_err;
-	}
-
-	/*
-	 * Map the Hyp stack pages
-	 */
-	for_each_possible_cpu(cpu) {
-		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-					  PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map hyp stack\n");
-			goto out_err;
-		}
-	}
-
-	for_each_possible_cpu(cpu) {
-		kvm_host_data_t *cpu_data;
-
-		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
-		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map host CPU state: %d\n", err);
-			goto out_err;
-		}
-	}
-
-	err = hyp_map_aux_data();
-	if (err)
-		kvm_err("Cannot map host auxiliary data: %d\n", err);
-
-	return 0;
-
-out_err:
-	teardown_hyp_mode();
-	kvm_err("error initializing Hyp mode: %d\n", err);
-	return err;
-}
-
-static void check_kvm_target_cpu(void *ret)
-{
-	*(int *)ret = kvm_target_cpu();
-}
-
-struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	mpidr &= MPIDR_HWID_BITMASK;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
-			return vcpu;
-	}
-	return NULL;
-}
-
-bool kvm_arch_has_irq_bypass(void)
-{
-	return true;
-}
-
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
-					  &irqfd->irq_entry);
-}
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
-				     &irqfd->irq_entry);
-}
-
-void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_arm_halt_guest(irqfd->kvm);
-}
-
-void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_arm_resume_guest(irqfd->kvm);
-}
-
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
-{
-	int err;
-	int ret, cpu;
-	bool in_hyp_mode;
-
-	if (!is_hyp_mode_available()) {
-		kvm_info("HYP mode not available\n");
-		return -ENODEV;
-	}
-
-	in_hyp_mode = is_kernel_in_hyp_mode();
-
-	if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-		kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-		return -ENODEV;
-	}
-
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
-		if (ret < 0) {
-			kvm_err("Error, CPU %d not supported!\n", cpu);
-			return -ENODEV;
-		}
-	}
-
-	err = init_common_resources();
-	if (err)
-		return err;
-
-	err = kvm_arm_init_sve();
-	if (err)
-		return err;
-
-	if (!in_hyp_mode) {
-		err = init_hyp_mode();
-		if (err)
-			goto out_err;
-	}
-
-	err = init_subsystems();
-	if (err)
-		goto out_hyp;
-
-	if (in_hyp_mode)
-		kvm_info("VHE mode initialized successfully\n");
-	else
-		kvm_info("Hyp mode initialized successfully\n");
-
-	return 0;
-
-out_hyp:
-	hyp_cpu_pm_exit();
-	if (!in_hyp_mode)
-		teardown_hyp_mode();
-out_err:
-	return err;
-}
-
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
-	kvm_perf_teardown();
-}
-
-static int arm_init(void)
-{
-	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-	return rc;
-}
-
-module_init(arm_init);
diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c
deleted file mode 100644
index 25c0e47d57cb..000000000000
--- a/virt/kvm/arm/hyp/aarch32.c
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-/*
- * stolen from arch/arm/kernel/opcodes.c
- *
- * condition code lookup table
- * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
- *
- * bit position in short is condition code: NZCV
- */
-static const unsigned short cc_map[16] = {
-	0xF0F0,			/* EQ == Z set            */
-	0x0F0F,			/* NE                     */
-	0xCCCC,			/* CS == C set            */
-	0x3333,			/* CC                     */
-	0xFF00,			/* MI == N set            */
-	0x00FF,			/* PL                     */
-	0xAAAA,			/* VS == V set            */
-	0x5555,			/* VC                     */
-	0x0C0C,			/* HI == C set && Z clear */
-	0xF3F3,			/* LS == C clear || Z set */
-	0xAA55,			/* GE == (N==V)           */
-	0x55AA,			/* LT == (N!=V)           */
-	0x0A05,			/* GT == (!Z && (N==V))   */
-	0xF5FA,			/* LE == (Z || (N!=V))    */
-	0xFFFF,			/* AL always              */
-	0			/* NV                     */
-};
-
-/*
- * Check if a trapped instruction should have been executed or not.
- */
-bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
-{
-	unsigned long cpsr;
-	u32 cpsr_cond;
-	int cond;
-
-	/* Top two bits non-zero?  Unconditional. */
-	if (kvm_vcpu_get_hsr(vcpu) >> 30)
-		return true;
-
-	/* Is condition field valid? */
-	cond = kvm_vcpu_get_condition(vcpu);
-	if (cond == 0xE)
-		return true;
-
-	cpsr = *vcpu_cpsr(vcpu);
-
-	if (cond < 0) {
-		/* This can happen in Thumb mode: examine IT state. */
-		unsigned long it;
-
-		it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-		/* it == 0 => unconditional. */
-		if (it == 0)
-			return true;
-
-		/* The cond for this insn works out as the top 4 bits. */
-		cond = (it >> 4);
-	}
-
-	cpsr_cond = cpsr >> 28;
-
-	if (!((cc_map[cond] >> cpsr_cond) & 1))
-		return false;
-
-	return true;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:	The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-	unsigned long itbits, cond;
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	bool is_arm = !(cpsr & PSR_AA32_T_BIT);
-
-	if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
-		return;
-
-	cond = (cpsr & 0xe000) >> 13;
-	itbits = (cpsr & 0x1c00) >> (10 - 2);
-	itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-	/* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
-	if ((itbits & 0x7) == 0)
-		itbits = cond = 0;
-	else
-		itbits = (itbits << 1) & 0x1f;
-
-	cpsr &= ~PSR_AA32_IT_MASK;
-	cpsr |= cond << 13;
-	cpsr |= (itbits & 0x1c) << (10 - 2);
-	cpsr |= (itbits & 0x3) << 25;
-	*vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-	u32 pc = *vcpu_pc(vcpu);
-	bool is_thumb;
-
-	is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
-	if (is_thumb && !is_wide_instr)
-		pc += 2;
-	else
-		pc += 4;
-
-	*vcpu_pc(vcpu) = pc;
-
-	kvm_adjust_itstate(vcpu);
-}
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
deleted file mode 100644
index ff76e6845fe4..000000000000
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <clocksource/arm_arch_timer.h>
-#include <linux/compiler.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_hyp.h>
-
-void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
-{
-	u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
-	write_sysreg(cntvoff, cntvoff_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val;
-
-	/* Allow physical timer/counter access for the host */
-	val = read_sysreg(cnthctl_el2);
-	val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
-	write_sysreg(val, cnthctl_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val;
-
-	/*
-	 * Disallow physical timer access for the guest
-	 * Physical counter access is allowed
-	 */
-	val = read_sysreg(cnthctl_el2);
-	val &= ~CNTHCTL_EL1PCEN;
-	val |= CNTHCTL_EL1PCTEN;
-	write_sysreg(val, cnthctl_el2);
-}
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
deleted file mode 100644
index ccf1fde9836c..000000000000
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ /dev/null
@@ -1,1130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-
-#define vtr_to_max_lr_idx(v)		((v) & 0xf)
-#define vtr_to_nr_pre_bits(v)		((((u32)(v) >> 26) & 7) + 1)
-#define vtr_to_nr_apr_regs(v)		(1 << (vtr_to_nr_pre_bits(v) - 5))
-
-static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
-{
-	switch (lr & 0xf) {
-	case 0:
-		return read_gicreg(ICH_LR0_EL2);
-	case 1:
-		return read_gicreg(ICH_LR1_EL2);
-	case 2:
-		return read_gicreg(ICH_LR2_EL2);
-	case 3:
-		return read_gicreg(ICH_LR3_EL2);
-	case 4:
-		return read_gicreg(ICH_LR4_EL2);
-	case 5:
-		return read_gicreg(ICH_LR5_EL2);
-	case 6:
-		return read_gicreg(ICH_LR6_EL2);
-	case 7:
-		return read_gicreg(ICH_LR7_EL2);
-	case 8:
-		return read_gicreg(ICH_LR8_EL2);
-	case 9:
-		return read_gicreg(ICH_LR9_EL2);
-	case 10:
-		return read_gicreg(ICH_LR10_EL2);
-	case 11:
-		return read_gicreg(ICH_LR11_EL2);
-	case 12:
-		return read_gicreg(ICH_LR12_EL2);
-	case 13:
-		return read_gicreg(ICH_LR13_EL2);
-	case 14:
-		return read_gicreg(ICH_LR14_EL2);
-	case 15:
-		return read_gicreg(ICH_LR15_EL2);
-	}
-
-	unreachable();
-}
-
-static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
-{
-	switch (lr & 0xf) {
-	case 0:
-		write_gicreg(val, ICH_LR0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_LR1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_LR2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_LR3_EL2);
-		break;
-	case 4:
-		write_gicreg(val, ICH_LR4_EL2);
-		break;
-	case 5:
-		write_gicreg(val, ICH_LR5_EL2);
-		break;
-	case 6:
-		write_gicreg(val, ICH_LR6_EL2);
-		break;
-	case 7:
-		write_gicreg(val, ICH_LR7_EL2);
-		break;
-	case 8:
-		write_gicreg(val, ICH_LR8_EL2);
-		break;
-	case 9:
-		write_gicreg(val, ICH_LR9_EL2);
-		break;
-	case 10:
-		write_gicreg(val, ICH_LR10_EL2);
-		break;
-	case 11:
-		write_gicreg(val, ICH_LR11_EL2);
-		break;
-	case 12:
-		write_gicreg(val, ICH_LR12_EL2);
-		break;
-	case 13:
-		write_gicreg(val, ICH_LR13_EL2);
-		break;
-	case 14:
-		write_gicreg(val, ICH_LR14_EL2);
-		break;
-	case 15:
-		write_gicreg(val, ICH_LR15_EL2);
-		break;
-	}
-}
-
-static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
-{
-	switch (n) {
-	case 0:
-		write_gicreg(val, ICH_AP0R0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_AP0R1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_AP0R2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_AP0R3_EL2);
-		break;
-	}
-}
-
-static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
-{
-	switch (n) {
-	case 0:
-		write_gicreg(val, ICH_AP1R0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_AP1R1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_AP1R2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_AP1R3_EL2);
-		break;
-	}
-}
-
-static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
-{
-	u32 val;
-
-	switch (n) {
-	case 0:
-		val = read_gicreg(ICH_AP0R0_EL2);
-		break;
-	case 1:
-		val = read_gicreg(ICH_AP0R1_EL2);
-		break;
-	case 2:
-		val = read_gicreg(ICH_AP0R2_EL2);
-		break;
-	case 3:
-		val = read_gicreg(ICH_AP0R3_EL2);
-		break;
-	default:
-		unreachable();
-	}
-
-	return val;
-}
-
-static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
-{
-	u32 val;
-
-	switch (n) {
-	case 0:
-		val = read_gicreg(ICH_AP1R0_EL2);
-		break;
-	case 1:
-		val = read_gicreg(ICH_AP1R1_EL2);
-		break;
-	case 2:
-		val = read_gicreg(ICH_AP1R2_EL2);
-		break;
-	case 3:
-		val = read_gicreg(ICH_AP1R3_EL2);
-		break;
-	default:
-		unreachable();
-	}
-
-	return val;
-}
-
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-	/*
-	 * Make sure stores to the GIC via the memory mapped interface
-	 * are now visible to the system register interface when reading the
-	 * LRs, and when reading back the VMCR on non-VHE systems.
-	 */
-	if (used_lrs || !has_vhe()) {
-		if (!cpu_if->vgic_sre) {
-			dsb(sy);
-			isb();
-		}
-	}
-
-	if (used_lrs || cpu_if->its_vpe.its_vm) {
-		int i;
-		u32 elrsr;
-
-		elrsr = read_gicreg(ICH_ELRSR_EL2);
-
-		write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
-
-		for (i = 0; i < used_lrs; i++) {
-			if (elrsr & (1 << i))
-				cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-			else
-				cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
-
-			__gic_v3_set_lr(0, i);
-		}
-	}
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	if (used_lrs || cpu_if->its_vpe.its_vm) {
-		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
-		for (i = 0; i < used_lrs; i++)
-			__gic_v3_set_lr(cpu_if->vgic_lr[i], i);
-	}
-
-	/*
-	 * Ensure that writes to the LRs, and on non-VHE systems ensure that
-	 * the write to the VMCR in __vgic_v3_activate_traps(), will have
-	 * reached the (re)distributors. This ensure the guest will read the
-	 * correct values from the memory-mapped interface.
-	 */
-	if (used_lrs || !has_vhe()) {
-		if (!cpu_if->vgic_sre) {
-			isb();
-			dsb(sy);
-		}
-	}
-}
-
-void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
-	 * Group0 interrupt (as generated in GICv2 mode) to be
-	 * delivered as a FIQ to the guest, with potentially fatal
-	 * consequences. So we must make sure that ICC_SRE_EL1 has
-	 * been actually programmed with the value we want before
-	 * starting to mess with the rest of the GIC, and VMCR_EL2 in
-	 * particular.  This logic must be called before
-	 * __vgic_v3_restore_state().
-	 */
-	if (!cpu_if->vgic_sre) {
-		write_gicreg(0, ICC_SRE_EL1);
-		isb();
-		write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
-
-		if (has_vhe()) {
-			/*
-			 * Ensure that the write to the VMCR will have reached
-			 * the (re)distributors. This ensure the guest will
-			 * read the correct values from the memory-mapped
-			 * interface.
-			 */
-			isb();
-			dsb(sy);
-		}
-	}
-
-	/*
-	 * Prevent the guest from touching the GIC system registers if
-	 * SRE isn't enabled for GICv3 emulation.
-	 */
-	write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-		     ICC_SRE_EL2);
-
-	/*
-	 * If we need to trap system registers, we must write
-	 * ICH_HCR_EL2 anyway, even if no interrupts are being
-	 * injected,
-	 */
-	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-	    cpu_if->its_vpe.its_vm)
-		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 val;
-
-	if (!cpu_if->vgic_sre) {
-		cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
-	}
-
-	val = read_gicreg(ICC_SRE_EL2);
-	write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-
-	if (!cpu_if->vgic_sre) {
-		/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-		isb();
-		write_gicreg(1, ICC_SRE_EL1);
-	}
-
-	/*
-	 * If we were trapping system registers, we enabled the VGIC even if
-	 * no interrupts were being injected, and we disable it again here.
-	 */
-	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-	    cpu_if->its_vpe.its_vm)
-		write_gicreg(0, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if;
-	u64 val;
-	u32 nr_pre_bits;
-
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	val = read_gicreg(ICH_VTR_EL2);
-	nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-	switch (nr_pre_bits) {
-	case 7:
-		cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
-		cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
-		/* Fall through */
-	case 6:
-		cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
-		/* Fall through */
-	default:
-		cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
-	}
-
-	switch (nr_pre_bits) {
-	case 7:
-		cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
-		cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
-		/* Fall through */
-	case 6:
-		cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
-		/* Fall through */
-	default:
-		cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
-	}
-}
-
-void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if;
-	u64 val;
-	u32 nr_pre_bits;
-
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	val = read_gicreg(ICH_VTR_EL2);
-	nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-	switch (nr_pre_bits) {
-	case 7:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
-		/* Fall through */
-	case 6:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
-		/* Fall through */
-	default:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
-	}
-
-	switch (nr_pre_bits) {
-	case 7:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
-		/* Fall through */
-	case 6:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
-		/* Fall through */
-	default:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
-	}
-}
-
-void __hyp_text __vgic_v3_init_lrs(void)
-{
-	int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
-	int i;
-
-	for (i = 0; i <= max_lr_idx; i++)
-		__gic_v3_set_lr(0, i);
-}
-
-u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
-{
-	return read_gicreg(ICH_VTR_EL2);
-}
-
-u64 __hyp_text __vgic_v3_read_vmcr(void)
-{
-	return read_gicreg(ICH_VMCR_EL2);
-}
-
-void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
-{
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-#ifdef CONFIG_ARM64
-
-static int __hyp_text __vgic_v3_bpr_min(void)
-{
-	/* See Pseudocode for VPriorityGroup */
-	return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
-}
-
-static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
-{
-	u32 esr = kvm_vcpu_get_hsr(vcpu);
-	u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
-
-	return crm != 8;
-}
-
-#define GICv3_IDLE_PRIORITY	0xff
-
-static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
-						    u32 vmcr,
-						    u64 *lr_val)
-{
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	u8 priority = GICv3_IDLE_PRIORITY;
-	int i, lr = -1;
-
-	for (i = 0; i < used_lrs; i++) {
-		u64 val = __gic_v3_get_lr(i);
-		u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-		/* Not pending in the state? */
-		if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
-			continue;
-
-		/* Group-0 interrupt, but Group-0 disabled? */
-		if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
-			continue;
-
-		/* Group-1 interrupt, but Group-1 disabled? */
-		if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
-			continue;
-
-		/* Not the highest priority? */
-		if (lr_prio >= priority)
-			continue;
-
-		/* This is a candidate */
-		priority = lr_prio;
-		*lr_val = val;
-		lr = i;
-	}
-
-	if (lr == -1)
-		*lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-	return lr;
-}
-
-static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
-					       int intid, u64 *lr_val)
-{
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	for (i = 0; i < used_lrs; i++) {
-		u64 val = __gic_v3_get_lr(i);
-
-		if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
-		    (val & ICH_LR_ACTIVE_BIT)) {
-			*lr_val = val;
-			return i;
-		}
-	}
-
-	*lr_val = ICC_IAR1_EL1_SPURIOUS;
-	return -1;
-}
-
-static int __hyp_text __vgic_v3_get_highest_active_priority(void)
-{
-	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-	u32 hap = 0;
-	int i;
-
-	for (i = 0; i < nr_apr_regs; i++) {
-		u32 val;
-
-		/*
-		 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
-		 * contain the active priority levels for this VCPU
-		 * for the maximum number of supported priority
-		 * levels, and we return the full priority level only
-		 * if the BPR is programmed to its minimum, otherwise
-		 * we return a combination of the priority level and
-		 * subpriority, as determined by the setting of the
-		 * BPR, but without the full subpriority.
-		 */
-		val  = __vgic_v3_read_ap0rn(i);
-		val |= __vgic_v3_read_ap1rn(i);
-		if (!val) {
-			hap += 32;
-			continue;
-		}
-
-		return (hap + __ffs(val)) << __vgic_v3_bpr_min();
-	}
-
-	return GICv3_IDLE_PRIORITY;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
-{
-	return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
-{
-	unsigned int bpr;
-
-	if (vmcr & ICH_VMCR_CBPR_MASK) {
-		bpr = __vgic_v3_get_bpr0(vmcr);
-		if (bpr < 7)
-			bpr++;
-	} else {
-		bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-	}
-
-	return bpr;
-}
-
-/*
- * Convert a priority to a preemption level, taking the relevant BPR
- * into account by zeroing the sub-priority bits.
- */
-static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
-{
-	unsigned int bpr;
-
-	if (!grp)
-		bpr = __vgic_v3_get_bpr0(vmcr) + 1;
-	else
-		bpr = __vgic_v3_get_bpr1(vmcr);
-
-	return pri & (GENMASK(7, 0) << bpr);
-}
-
-/*
- * The priority value is independent of any of the BPR values, so we
- * normalize it using the minumal BPR value. This guarantees that no
- * matter what the guest does with its BPR, we can always set/get the
- * same value of a priority.
- */
-static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
-{
-	u8 pre, ap;
-	u32 val;
-	int apr;
-
-	pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
-	ap = pre >> __vgic_v3_bpr_min();
-	apr = ap / 32;
-
-	if (!grp) {
-		val = __vgic_v3_read_ap0rn(apr);
-		__vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
-	} else {
-		val = __vgic_v3_read_ap1rn(apr);
-		__vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
-	}
-}
-
-static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
-{
-	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-	u32 hap = 0;
-	int i;
-
-	for (i = 0; i < nr_apr_regs; i++) {
-		u32 ap0, ap1;
-		int c0, c1;
-
-		ap0 = __vgic_v3_read_ap0rn(i);
-		ap1 = __vgic_v3_read_ap1rn(i);
-		if (!ap0 && !ap1) {
-			hap += 32;
-			continue;
-		}
-
-		c0 = ap0 ? __ffs(ap0) : 32;
-		c1 = ap1 ? __ffs(ap1) : 32;
-
-		/* Always clear the LSB, which is the highest priority */
-		if (c0 < c1) {
-			ap0 &= ~BIT(c0);
-			__vgic_v3_write_ap0rn(ap0, i);
-			hap += c0;
-		} else {
-			ap1 &= ~BIT(c1);
-			__vgic_v3_write_ap1rn(ap1, i);
-			hap += c1;
-		}
-
-		/* Rescale to 8 bits of priority */
-		return hap << __vgic_v3_bpr_min();
-	}
-
-	return GICv3_IDLE_PRIORITY;
-}
-
-static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 lr_val;
-	u8 lr_prio, pmr;
-	int lr, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-	if (lr < 0)
-		goto spurious;
-
-	if (grp != !!(lr_val & ICH_LR_GROUP))
-		goto spurious;
-
-	pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-	if (pmr <= lr_prio)
-		goto spurious;
-
-	if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
-		goto spurious;
-
-	lr_val &= ~ICH_LR_STATE;
-	/* No active state for LPIs */
-	if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
-		lr_val |= ICH_LR_ACTIVE_BIT;
-	__gic_v3_set_lr(lr_val, lr);
-	__vgic_v3_set_active_priority(lr_prio, vmcr, grp);
-	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-	return;
-
-spurious:
-	vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
-}
-
-static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
-{
-	lr_val &= ~ICH_LR_ACTIVE_BIT;
-	if (lr_val & ICH_LR_HW) {
-		u32 pid;
-
-		pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
-		gic_write_dir(pid);
-	}
-
-	__gic_v3_set_lr(lr_val, lr);
-}
-
-static void __hyp_text __vgic_v3_bump_eoicount(void)
-{
-	u32 hcr;
-
-	hcr = read_gicreg(ICH_HCR_EL2);
-	hcr += 1 << ICH_HCR_EOIcount_SHIFT;
-	write_gicreg(hcr, ICH_HCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 vid = vcpu_get_reg(vcpu, rt);
-	u64 lr_val;
-	int lr;
-
-	/* EOImode == 0, nothing to be done here */
-	if (!(vmcr & ICH_VMCR_EOIM_MASK))
-		return;
-
-	/* No deactivate to be performed on an LPI */
-	if (vid >= VGIC_MIN_LPI)
-		return;
-
-	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-	if (lr == -1) {
-		__vgic_v3_bump_eoicount();
-		return;
-	}
-
-	__vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u32 vid = vcpu_get_reg(vcpu, rt);
-	u64 lr_val;
-	u8 lr_prio, act_prio;
-	int lr, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	/* Drop priority in any case */
-	act_prio = __vgic_v3_clear_highest_active_priority();
-
-	/* If EOIing an LPI, no deactivate to be performed */
-	if (vid >= VGIC_MIN_LPI)
-		return;
-
-	/* EOImode == 1, nothing to be done here */
-	if (vmcr & ICH_VMCR_EOIM_MASK)
-		return;
-
-	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-	if (lr == -1) {
-		__vgic_v3_bump_eoicount();
-		return;
-	}
-
-	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-	/* If priorities or group do not match, the guest has fscked-up. */
-	if (grp != !!(lr_val & ICH_LR_GROUP) ||
-	    __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
-		return;
-
-	/* Let's now perform the deactivation */
-	__vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
-}
-
-static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
-}
-
-static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & 1)
-		vmcr |= ICH_VMCR_ENG0_MASK;
-	else
-		vmcr &= ~ICH_VMCR_ENG0_MASK;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & 1)
-		vmcr |= ICH_VMCR_ENG1_MASK;
-	else
-		vmcr &= ~ICH_VMCR_ENG1_MASK;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
-}
-
-static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
-}
-
-static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-	u8 bpr_min = __vgic_v3_bpr_min() - 1;
-
-	/* Enforce BPR limiting */
-	if (val < bpr_min)
-		val = bpr_min;
-
-	val <<= ICH_VMCR_BPR0_SHIFT;
-	val &= ICH_VMCR_BPR0_MASK;
-	vmcr &= ~ICH_VMCR_BPR0_MASK;
-	vmcr |= val;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-	u8 bpr_min = __vgic_v3_bpr_min();
-
-	if (vmcr & ICH_VMCR_CBPR_MASK)
-		return;
-
-	/* Enforce BPR limiting */
-	if (val < bpr_min)
-		val = bpr_min;
-
-	val <<= ICH_VMCR_BPR1_SHIFT;
-	val &= ICH_VMCR_BPR1_MASK;
-	vmcr &= ~ICH_VMCR_BPR1_MASK;
-	vmcr |= val;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-	u32 val;
-
-	if (!__vgic_v3_get_group(vcpu))
-		val = __vgic_v3_read_ap0rn(n);
-	else
-		val = __vgic_v3_read_ap1rn(n);
-
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	if (!__vgic_v3_get_group(vcpu))
-		__vgic_v3_write_ap0rn(val, n);
-	else
-		__vgic_v3_write_ap1rn(val, n);
-}
-
-static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	u64 lr_val;
-	int lr, lr_grp, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-	if (lr == -1)
-		goto spurious;
-
-	lr_grp = !!(lr_val & ICH_LR_GROUP);
-	if (lr_grp != grp)
-		lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-spurious:
-	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-}
-
-static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
-					  u32 vmcr, int rt)
-{
-	vmcr &= ICH_VMCR_PMR_MASK;
-	vmcr >>= ICH_VMCR_PMR_SHIFT;
-	vcpu_set_reg(vcpu, rt, vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	val <<= ICH_VMCR_PMR_SHIFT;
-	val &= ICH_VMCR_PMR_MASK;
-	vmcr &= ~ICH_VMCR_PMR_MASK;
-	vmcr |= val;
-
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
-					  u32 vmcr, int rt)
-{
-	u32 val = __vgic_v3_get_highest_active_priority();
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 vtr, val;
-
-	vtr = read_gicreg(ICH_VTR_EL2);
-	/* PRIbits */
-	val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
-	/* IDbits */
-	val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
-	/* SEIS */
-	val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
-	/* A3V */
-	val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
-	/* EOImode */
-	val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
-	/* CBPR */
-	val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & ICC_CTLR_EL1_CBPR_MASK)
-		vmcr |= ICH_VMCR_CBPR_MASK;
-	else
-		vmcr &= ~ICH_VMCR_CBPR_MASK;
-
-	if (val & ICC_CTLR_EL1_EOImode_MASK)
-		vmcr |= ICH_VMCR_EOIM_MASK;
-	else
-		vmcr &= ~ICH_VMCR_EOIM_MASK;
-
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
-{
-	int rt;
-	u32 esr;
-	u32 vmcr;
-	void (*fn)(struct kvm_vcpu *, u32, int);
-	bool is_read;
-	u32 sysreg;
-
-	esr = kvm_vcpu_get_hsr(vcpu);
-	if (vcpu_mode_is_32bit(vcpu)) {
-		if (!kvm_condition_valid(vcpu)) {
-			__kvm_skip_instr(vcpu);
-			return 1;
-		}
-
-		sysreg = esr_cp15_to_sysreg(esr);
-	} else {
-		sysreg = esr_sys64_to_sysreg(esr);
-	}
-
-	is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
-
-	switch (sysreg) {
-	case SYS_ICC_IAR0_EL1:
-	case SYS_ICC_IAR1_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_iar;
-		break;
-	case SYS_ICC_EOIR0_EL1:
-	case SYS_ICC_EOIR1_EL1:
-		if (unlikely(is_read))
-			return 0;
-		fn = __vgic_v3_write_eoir;
-		break;
-	case SYS_ICC_IGRPEN1_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_igrpen1;
-		else
-			fn = __vgic_v3_write_igrpen1;
-		break;
-	case SYS_ICC_BPR1_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_bpr1;
-		else
-			fn = __vgic_v3_write_bpr1;
-		break;
-	case SYS_ICC_AP0Rn_EL1(0):
-	case SYS_ICC_AP1Rn_EL1(0):
-		if (is_read)
-			fn = __vgic_v3_read_apxr0;
-		else
-			fn = __vgic_v3_write_apxr0;
-		break;
-	case SYS_ICC_AP0Rn_EL1(1):
-	case SYS_ICC_AP1Rn_EL1(1):
-		if (is_read)
-			fn = __vgic_v3_read_apxr1;
-		else
-			fn = __vgic_v3_write_apxr1;
-		break;
-	case SYS_ICC_AP0Rn_EL1(2):
-	case SYS_ICC_AP1Rn_EL1(2):
-		if (is_read)
-			fn = __vgic_v3_read_apxr2;
-		else
-			fn = __vgic_v3_write_apxr2;
-		break;
-	case SYS_ICC_AP0Rn_EL1(3):
-	case SYS_ICC_AP1Rn_EL1(3):
-		if (is_read)
-			fn = __vgic_v3_read_apxr3;
-		else
-			fn = __vgic_v3_write_apxr3;
-		break;
-	case SYS_ICC_HPPIR0_EL1:
-	case SYS_ICC_HPPIR1_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_hppir;
-		break;
-	case SYS_ICC_IGRPEN0_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_igrpen0;
-		else
-			fn = __vgic_v3_write_igrpen0;
-		break;
-	case SYS_ICC_BPR0_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_bpr0;
-		else
-			fn = __vgic_v3_write_bpr0;
-		break;
-	case SYS_ICC_DIR_EL1:
-		if (unlikely(is_read))
-			return 0;
-		fn = __vgic_v3_write_dir;
-		break;
-	case SYS_ICC_RPR_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_rpr;
-		break;
-	case SYS_ICC_CTLR_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_ctlr;
-		else
-			fn = __vgic_v3_write_ctlr;
-		break;
-	case SYS_ICC_PMR_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_pmr;
-		else
-			fn = __vgic_v3_write_pmr;
-		break;
-	default:
-		return 0;
-	}
-
-	vmcr = __vgic_v3_read_vmcr();
-	rt = kvm_vcpu_sys_get_rt(vcpu);
-	fn(vcpu, vmcr, rt);
-
-	__kvm_skip_instr(vcpu);
-
-	return 1;
-}
-
-#endif
diff --git a/virt/kvm/arm/hypercalls.c b/virt/kvm/arm/hypercalls.c
deleted file mode 100644
index 550dfa3e53cd..000000000000
--- a/virt/kvm/arm/hypercalls.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Arm Ltd.
-
-#include <linux/arm-smccc.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-#include <kvm/arm_hypercalls.h>
-#include <kvm/arm_psci.h>
-
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
-{
-	u32 func_id = smccc_get_function(vcpu);
-	long val = SMCCC_RET_NOT_SUPPORTED;
-	u32 feature;
-	gpa_t gpa;
-
-	switch (func_id) {
-	case ARM_SMCCC_VERSION_FUNC_ID:
-		val = ARM_SMCCC_VERSION_1_1;
-		break;
-	case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
-		feature = smccc_get_arg1(vcpu);
-		switch (feature) {
-		case ARM_SMCCC_ARCH_WORKAROUND_1:
-			switch (kvm_arm_harden_branch_predictor()) {
-			case KVM_BP_HARDEN_UNKNOWN:
-				break;
-			case KVM_BP_HARDEN_WA_NEEDED:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_BP_HARDEN_NOT_REQUIRED:
-				val = SMCCC_RET_NOT_REQUIRED;
-				break;
-			}
-			break;
-		case ARM_SMCCC_ARCH_WORKAROUND_2:
-			switch (kvm_arm_have_ssbd()) {
-			case KVM_SSBD_FORCE_DISABLE:
-			case KVM_SSBD_UNKNOWN:
-				break;
-			case KVM_SSBD_KERNEL:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_SSBD_FORCE_ENABLE:
-			case KVM_SSBD_MITIGATED:
-				val = SMCCC_RET_NOT_REQUIRED;
-				break;
-			}
-			break;
-		case ARM_SMCCC_HV_PV_TIME_FEATURES:
-			val = SMCCC_RET_SUCCESS;
-			break;
-		}
-		break;
-	case ARM_SMCCC_HV_PV_TIME_FEATURES:
-		val = kvm_hypercall_pv_features(vcpu);
-		break;
-	case ARM_SMCCC_HV_PV_TIME_ST:
-		gpa = kvm_init_stolen_time(vcpu);
-		if (gpa != GPA_INVALID)
-			val = gpa;
-		break;
-	default:
-		return kvm_psci_call(vcpu);
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return 1;
-}
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
deleted file mode 100644
index aedfcff99ac5..000000000000
--- a/virt/kvm/arm/mmio.c
+++ /dev/null
@@ -1,200 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <trace/events/kvm.h>
-
-#include "trace.h"
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
-{
-	void *datap = NULL;
-	union {
-		u8	byte;
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		tmp.byte	= data;
-		datap		= &tmp.byte;
-		break;
-	case 2:
-		tmp.hword	= data;
-		datap		= &tmp.hword;
-		break;
-	case 4:
-		tmp.word	= data;
-		datap		= &tmp.word;
-		break;
-	case 8:
-		tmp.dword	= data;
-		datap		= &tmp.dword;
-		break;
-	}
-
-	memcpy(buf, datap, len);
-}
-
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
-{
-	unsigned long data = 0;
-	union {
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		data = *(u8 *)buf;
-		break;
-	case 2:
-		memcpy(&tmp.hword, buf, len);
-		data = tmp.hword;
-		break;
-	case 4:
-		memcpy(&tmp.word, buf, len);
-		data = tmp.word;
-		break;
-	case 8:
-		memcpy(&tmp.dword, buf, len);
-		data = tmp.dword;
-		break;
-	}
-
-	return data;
-}
-
-/**
- * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
- *			     or in-kernel IO emulation
- *
- * @vcpu: The VCPU pointer
- * @run:  The VCPU run struct containing the mmio data
- */
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long data;
-	unsigned int len;
-	int mask;
-
-	/* Detect an already handled MMIO return */
-	if (unlikely(!vcpu->mmio_needed))
-		return 0;
-
-	vcpu->mmio_needed = 0;
-
-	if (!kvm_vcpu_dabt_iswrite(vcpu)) {
-		len = kvm_vcpu_dabt_get_as(vcpu);
-		data = kvm_mmio_read_buf(run->mmio.data, len);
-
-		if (kvm_vcpu_dabt_issext(vcpu) &&
-		    len < sizeof(unsigned long)) {
-			mask = 1U << ((len * 8) - 1);
-			data = (data ^ mask) - mask;
-		}
-
-		if (!kvm_vcpu_dabt_issf(vcpu))
-			data = data & 0xffffffff;
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-			       &data);
-		data = vcpu_data_host_to_guest(vcpu, data, len);
-		vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);
-	}
-
-	/*
-	 * The MMIO instruction is emulated and should not be re-executed
-	 * in the guest.
-	 */
-	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-
-	return 0;
-}
-
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		 phys_addr_t fault_ipa)
-{
-	unsigned long data;
-	unsigned long rt;
-	int ret;
-	bool is_write;
-	int len;
-	u8 data_buf[8];
-
-	/*
-	 * No valid syndrome? Ask userspace for help if it has
-	 * voluntered to do so, and bail out otherwise.
-	 */
-	if (!kvm_vcpu_dabt_isvalid(vcpu)) {
-		if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
-			run->exit_reason = KVM_EXIT_ARM_NISV;
-			run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
-			run->arm_nisv.fault_ipa = fault_ipa;
-			return 0;
-		}
-
-		kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
-		return -ENOSYS;
-	}
-
-	/* Page table accesses IO mem: tell guest to fix its TTBR */
-	if (kvm_vcpu_dabt_iss1tw(vcpu)) {
-		kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
-	/*
-	 * Prepare MMIO operation. First decode the syndrome data we get
-	 * from the CPU. Then try if some in-kernel emulation feels
-	 * responsible, otherwise let user space do its magic.
-	 */
-	is_write = kvm_vcpu_dabt_iswrite(vcpu);
-	len = kvm_vcpu_dabt_get_as(vcpu);
-	rt = kvm_vcpu_dabt_get_rd(vcpu);
-
-	if (is_write) {
-		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
-					       len);
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
-		kvm_mmio_write_buf(data_buf, len, data);
-
-		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				       data_buf);
-	} else {
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
-			       fault_ipa, NULL);
-
-		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				      data_buf);
-	}
-
-	/* Now prepare kvm_run for the potential return to userland. */
-	run->mmio.is_write	= is_write;
-	run->mmio.phys_addr	= fault_ipa;
-	run->mmio.len		= len;
-	vcpu->mmio_needed	= 1;
-
-	if (!ret) {
-		/* We handled the access successfully in the kernel. */
-		if (!is_write)
-			memcpy(run->mmio.data, data_buf, len);
-		vcpu->stat.mmio_exit_kernel++;
-		kvm_handle_mmio_return(vcpu, run);
-		return 1;
-	}
-
-	if (is_write)
-		memcpy(run->mmio.data, data_buf, len);
-	vcpu->stat.mmio_exit_user++;
-	run->exit_reason	= KVM_EXIT_MMIO;
-	return 0;
-}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
deleted file mode 100644
index e3b9ee268823..000000000000
--- a/virt/kvm/arm/mmu.c
+++ /dev/null
@@ -1,2447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- */
-
-#include <linux/mman.h>
-#include <linux/kvm_host.h>
-#include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/signal.h>
-#include <trace/events/kvm.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_ras.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_emulate.h>
-#include <asm/virt.h>
-
-#include "trace.h"
-
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
-static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
-
-static unsigned long io_map_base;
-
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-
-#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
-
-static bool is_iomap(unsigned long flags)
-{
-	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
-}
-
-static bool memslot_is_logging(struct kvm_memory_slot *memslot)
-{
-	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
-}
-
-/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
- * @kvm:	pointer to kvm structure.
- *
- * Interface to HYP function to flush all VM TLB entries
- */
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
-}
-
-static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-	__kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-	__kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-	__kvm_flush_dcache_pud(pud);
-}
-
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
-	return !pfn_valid(pfn);
-}
-
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @kvm:	pointer to kvm structure.
- * @addr:	IPA
- * @pmd:	pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
-{
-	if (!pmd_thp_or_huge(*pmd))
-		return;
-
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @kvm:	pointer to kvm structure.
- * @addr:	IPA
- * @pud:	pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
-{
-	if (!stage2_pud_huge(kvm, *pudp))
-		return;
-
-	stage2_pud_clear(kvm, pudp);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	put_page(virt_to_page(pudp));
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-				  int min, int max)
-{
-	void *page;
-
-	BUG_ON(max > KVM_NR_MEM_OBJS);
-	if (cache->nobjs >= min)
-		return 0;
-	while (cache->nobjs < max) {
-		page = (void *)__get_free_page(GFP_PGTABLE_USER);
-		if (!page)
-			return -ENOMEM;
-		cache->objects[cache->nobjs++] = page;
-	}
-	return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-	while (mc->nobjs)
-		free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-	void *p;
-
-	BUG_ON(!mc || !mc->nobjs);
-	p = mc->objects[--mc->nobjs];
-	return p;
-}
-
-static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
-{
-	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
-	stage2_pgd_clear(kvm, pgd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pud_free(kvm, pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
-{
-	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
-	stage2_pud_clear(kvm, pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pmd_free(kvm, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	free_page((unsigned long)pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
-	WRITE_ONCE(*ptep, new_pte);
-	dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
-	WRITE_ONCE(*pmdp, new_pmd);
-	dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
-	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
-	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
-	dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
-{
-	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
-	dsb(ishst);
-}
-
-/*
- * Unmapping vs dcache management:
- *
- * If a guest maps certain memory pages as uncached, all writes will
- * bypass the data cache and go directly to RAM.  However, the CPUs
- * can still speculate reads (not writes) and fill cache lines with
- * data.
- *
- * Those cache lines will be *clean* cache lines though, so a
- * clean+invalidate operation is equivalent to an invalidate
- * operation, because no cache lines are marked dirty.
- *
- * Those clean cache lines could be filled prior to an uncached write
- * by the guest, and the cache coherent IO subsystem would therefore
- * end up writing old data to disk.
- *
- * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
- *
- * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
- * we then fully enforce cacheability of RAM, no matter what the guest
- * does.
- */
-static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t start_addr = addr;
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			pte_t old_pte = *pte;
-
-			kvm_set_pte(pte, __pte(0));
-			kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-			/* No need to invalidate the cache for device mappings */
-			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-				kvm_flush_dcache_pte(old_pte);
-
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (stage2_pte_table_empty(kvm, start_pte))
-		clear_stage2_pmd_entry(kvm, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				pmd_t old_pmd = *pmd;
-
-				pmd_clear(pmd);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-				kvm_flush_dcache_pmd(old_pmd);
-
-				put_page(virt_to_page(pmd));
-			} else {
-				unmap_stage2_ptes(kvm, pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-
-	if (stage2_pmd_table_empty(kvm, start_pmd))
-		clear_stage2_pud_entry(kvm, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				pud_t old_pud = *pud;
-
-				stage2_pud_clear(kvm, pud);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-				kvm_flush_dcache_pud(old_pud);
-				put_page(virt_to_page(pud));
-			} else {
-				unmap_stage2_pmds(kvm, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-
-	if (stage2_pud_table_empty(kvm, start_pud))
-		clear_stage2_pgd_entry(kvm, pgd, start_addr);
-}
-
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	assert_spin_locked(&kvm->mmu_lock);
-	WARN_ON(size & ~PAGE_MASK);
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Make sure the page table is still active, as another thread
-		 * could have possibly freed the page table, while we released
-		 * the lock.
-		 */
-		if (!READ_ONCE(kvm->arch.pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			unmap_stage2_puds(kvm, pgd, addr, next);
-		/*
-		 * If the range is too large, release the kvm->mmu_lock
-		 * to prevent starvation and lockup detector warnings.
-		 */
-		if (next != end)
-			cond_resched_lock(&kvm->mmu_lock);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-			kvm_flush_dcache_pte(*pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd))
-				kvm_flush_dcache_pmd(*pmd);
-			else
-				stage2_flush_ptes(kvm, pmd, addr, next);
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud))
-				kvm_flush_dcache_pud(*pud);
-			else
-				stage2_flush_pmds(kvm, pud, addr, next);
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-	phys_addr_t next;
-	pgd_t *pgd;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			stage2_flush_puds(kvm, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
- * @kvm: The struct kvm pointer
- *
- * Go through the stage 2 page tables and invalidate any cache lines
- * backing memory already mapped to the VM.
- */
-static void stage2_flush_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_flush_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
-	pgd_clear(pgd);
-	pud_free(NULL, pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
-	pmd_free(NULL, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			kvm_set_pte(pte, __pte(0));
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (hyp_pte_table_empty(start_pte))
-		clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		/* Hyp doesn't use huge pmds */
-		if (!pmd_none(*pmd))
-			unmap_hyp_ptes(pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-
-	if (hyp_pmd_table_empty(start_pmd))
-		clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		/* Hyp doesn't use huge puds */
-		if (!pud_none(*pud))
-			unmap_hyp_pmds(pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-
-	if (hyp_pud_table_empty(start_pud))
-		clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
-	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-			      phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	/*
-	 * We don't unmap anything from HYP, except at the hyp tear down.
-	 * Hence, we don't have to invalidate the TLBs here.
-	 */
-	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_hyp_puds(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
-/**
- * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
- */
-void free_hyp_pgds(void)
-{
-	pgd_t *id_pgd;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
-	if (id_pgd) {
-		/* In case we never called hyp_mmu_init() */
-		if (!io_map_base)
-			io_map_base = hyp_idmap_start;
-		unmap_hyp_idmap_range(id_pgd, io_map_base,
-				      hyp_idmap_start + PAGE_SIZE - io_map_base);
-	}
-
-	if (boot_hyp_pgd) {
-		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-		boot_hyp_pgd = NULL;
-	}
-
-	if (hyp_pgd) {
-		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
-				(uintptr_t)high_memory - PAGE_OFFSET);
-
-		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-		hyp_pgd = NULL;
-	}
-	if (merged_hyp_pgd) {
-		clear_page(merged_hyp_pgd);
-		free_page((unsigned long)merged_hyp_pgd);
-		merged_hyp_pgd = NULL;
-	}
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-				    unsigned long end, unsigned long pfn,
-				    pgprot_t prot)
-{
-	pte_t *pte;
-	unsigned long addr;
-
-	addr = start;
-	do {
-		pte = pte_offset_kernel(pmd, addr);
-		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
-		get_page(virt_to_page(pte));
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr, next;
-
-	addr = start;
-	do {
-		pmd = pmd_offset(pud, addr);
-
-		BUG_ON(pmd_sect(*pmd));
-
-		if (pmd_none(*pmd)) {
-			pte = pte_alloc_one_kernel(NULL);
-			if (!pte) {
-				kvm_err("Cannot allocate Hyp pte\n");
-				return -ENOMEM;
-			}
-			kvm_pmd_populate(pmd, pte);
-			get_page(virt_to_page(pmd));
-		}
-
-		next = pmd_addr_end(addr, end);
-
-		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr, next;
-	int ret;
-
-	addr = start;
-	do {
-		pud = pud_offset(pgd, addr);
-
-		if (pud_none_or_clear_bad(pud)) {
-			pmd = pmd_alloc_one(NULL, addr);
-			if (!pmd) {
-				kvm_err("Cannot allocate Hyp pmd\n");
-				return -ENOMEM;
-			}
-			kvm_pud_populate(pud, pmd);
-			get_page(virt_to_page(pud));
-		}
-
-		next = pud_addr_end(addr, end);
-		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-		if (ret)
-			return ret;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-				 unsigned long start, unsigned long end,
-				 unsigned long pfn, pgprot_t prot)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	unsigned long addr, next;
-	int err = 0;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-	addr = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-	do {
-		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
-		if (pgd_none(*pgd)) {
-			pud = pud_alloc_one(NULL, addr);
-			if (!pud) {
-				kvm_err("Cannot allocate Hyp pud\n");
-				err = -ENOMEM;
-				goto out;
-			}
-			kvm_pgd_populate(pgd, pud);
-			get_page(virt_to_page(pgd));
-		}
-
-		next = pgd_addr_end(addr, end);
-		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
-		if (err)
-			goto out;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-out:
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-	return err;
-}
-
-static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
-{
-	if (!is_vmalloc_addr(kaddr)) {
-		BUG_ON(!virt_addr_valid(kaddr));
-		return __pa(kaddr);
-	} else {
-		return page_to_phys(vmalloc_to_page(kaddr)) +
-		       offset_in_page(kaddr);
-	}
-}
-
-/**
- * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
- * @from:	The virtual kernel start address of the range
- * @to:		The virtual kernel end address of the range (exclusive)
- * @prot:	The protection to be applied to this range
- *
- * The same virtual address as the kernel virtual address is also used
- * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
- * physical pages.
- */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
-{
-	phys_addr_t phys_addr;
-	unsigned long virt_addr;
-	unsigned long start = kern_hyp_va((unsigned long)from);
-	unsigned long end = kern_hyp_va((unsigned long)to);
-
-	if (is_kernel_in_hyp_mode())
-		return 0;
-
-	start = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-
-	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
-		int err;
-
-		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
-					    virt_addr, virt_addr + PAGE_SIZE,
-					    __phys_to_pfn(phys_addr),
-					    prot);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-					unsigned long *haddr, pgprot_t prot)
-{
-	pgd_t *pgd = hyp_pgd;
-	unsigned long base;
-	int ret = 0;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	/*
-	 * This assumes that we we have enough space below the idmap
-	 * page to allocate our VAs. If not, the check below will
-	 * kick. A potential alternative would be to detect that
-	 * overflow and switch to an allocation above the idmap.
-	 *
-	 * The allocated size is always a multiple of PAGE_SIZE.
-	 */
-	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
-	base = io_map_base - size;
-
-	/*
-	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
-	 * allocating the new area, as it would indicate we've
-	 * overflowed the idmap/IO address range.
-	 */
-	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
-		ret = -ENOMEM;
-	else
-		io_map_base = base;
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-
-	if (ret)
-		goto out;
-
-	if (__kvm_cpu_uses_extended_idmap())
-		pgd = boot_hyp_pgd;
-
-	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				    base, base + size,
-				    __phys_to_pfn(phys_addr), prot);
-	if (ret)
-		goto out;
-
-	*haddr = base + offset_in_page(phys_addr);
-
-out:
-	return ret;
-}
-
-/**
- * create_hyp_io_mappings - Map IO into both kernel and HYP
- * @phys_addr:	The physical start address which gets mapped
- * @size:	Size of the region being mapped
- * @kaddr:	Kernel VA for this mapping
- * @haddr:	HYP VA for this mapping
- */
-int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
-			   void __iomem **kaddr,
-			   void __iomem **haddr)
-{
-	unsigned long addr;
-	int ret;
-
-	*kaddr = ioremap(phys_addr, size);
-	if (!*kaddr)
-		return -ENOMEM;
-
-	if (is_kernel_in_hyp_mode()) {
-		*haddr = *kaddr;
-		return 0;
-	}
-
-	ret = __create_hyp_private_mapping(phys_addr, size,
-					   &addr, PAGE_HYP_DEVICE);
-	if (ret) {
-		iounmap(*kaddr);
-		*kaddr = NULL;
-		*haddr = NULL;
-		return ret;
-	}
-
-	*haddr = (void __iomem *)addr;
-	return 0;
-}
-
-/**
- * create_hyp_exec_mappings - Map an executable range into HYP
- * @phys_addr:	The physical start address which gets mapped
- * @size:	Size of the region being mapped
- * @haddr:	HYP VA for this mapping
- */
-int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
-			     void **haddr)
-{
-	unsigned long addr;
-	int ret;
-
-	BUG_ON(is_kernel_in_hyp_mode());
-
-	ret = __create_hyp_private_mapping(phys_addr, size,
-					   &addr, PAGE_HYP_EXEC);
-	if (ret) {
-		*haddr = NULL;
-		return ret;
-	}
-
-	*haddr = (void *)addr;
-	return 0;
-}
-
-/**
- * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(kvm).
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_alloc_stage2_pgd(struct kvm *kvm)
-{
-	phys_addr_t pgd_phys;
-	pgd_t *pgd;
-
-	if (kvm->arch.pgd != NULL) {
-		kvm_err("kvm_arch already initialized?\n");
-		return -EINVAL;
-	}
-
-	/* Allocate the HW PGD, making sure that each page gets its own refcount */
-	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
-	if (!pgd)
-		return -ENOMEM;
-
-	pgd_phys = virt_to_phys(pgd);
-	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
-		return -EINVAL;
-
-	kvm->arch.pgd = pgd;
-	kvm->arch.pgd_phys = pgd_phys;
-	return 0;
-}
-
-static void stage2_unmap_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	hva_t hva = memslot->userspace_addr;
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = PAGE_SIZE * memslot->npages;
-	hva_t reg_end = hva + size;
-
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we should
-	 * unmap any of them.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (!(vma->vm_flags & VM_PFNMAP)) {
-			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-}
-
-/**
- * stage2_unmap_vm - Unmap Stage-2 RAM mappings
- * @kvm: The struct kvm pointer
- *
- * Go through the memregions and unmap any reguler RAM
- * backing memory already mapped to the VM.
- */
-void stage2_unmap_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	down_read(&current->mm->mmap_sem);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_unmap_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	up_read(&current->mm->mmap_sem);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-/**
- * kvm_free_stage2_pgd - free all stage-2 tables
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
- * underlying level-2 and level-3 tables before freeing the actual level-1 table
- * and setting the struct pointer to NULL.
- */
-void kvm_free_stage2_pgd(struct kvm *kvm)
-{
-	void *pgd = NULL;
-
-	spin_lock(&kvm->mmu_lock);
-	if (kvm->arch.pgd) {
-		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
-		pgd = READ_ONCE(kvm->arch.pgd);
-		kvm->arch.pgd = NULL;
-		kvm->arch.pgd_phys = 0;
-	}
-	spin_unlock(&kvm->mmu_lock);
-
-	/* Free the HW pgd, one page at a time */
-	if (pgd)
-		free_pages_exact(pgd, stage2_pgd_size(kvm));
-}
-
-static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	if (stage2_pgd_none(kvm, *pgd)) {
-		if (!cache)
-			return NULL;
-		pud = mmu_memory_cache_alloc(cache);
-		stage2_pgd_populate(kvm, pgd, pud);
-		get_page(virt_to_page(pgd));
-	}
-
-	return stage2_pud_offset(kvm, pgd, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-
-	pud = stage2_get_pud(kvm, cache, addr);
-	if (!pud || stage2_pud_huge(kvm, *pud))
-		return NULL;
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return NULL;
-		pmd = mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
-			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
-{
-	pmd_t *pmd, old_pmd;
-
-retry:
-	pmd = stage2_get_pmd(kvm, cache, addr);
-	VM_BUG_ON(!pmd);
-
-	old_pmd = *pmd;
-	/*
-	 * Multiple vcpus faulting on the same PMD entry, can
-	 * lead to them sequentially updating the PMD with the
-	 * same value. Following the break-before-make
-	 * (pmd_clear() followed by tlb_flush()) process can
-	 * hinder forward progress due to refaults generated
-	 * on missing translations.
-	 *
-	 * Skip updating the page table if the entry is
-	 * unchanged.
-	 */
-	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-		return 0;
-
-	if (pmd_present(old_pmd)) {
-		/*
-		 * If we already have PTE level mapping for this block,
-		 * we must unmap it to avoid inconsistent TLB state and
-		 * leaking the table page. We could end up in this situation
-		 * if the memory slot was marked for dirty logging and was
-		 * reverted, leaving PTE level mappings for the pages accessed
-		 * during the period. So, unmap the PTE level mapping for this
-		 * block and retry, as we could have released the upper level
-		 * table in the process.
-		 *
-		 * Normal THP split/merge follows mmu_notifier callbacks and do
-		 * get handled accordingly.
-		 */
-		if (!pmd_thp_or_huge(old_pmd)) {
-			unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
-			goto retry;
-		}
-		/*
-		 * Mapping in huge pages should only happen through a
-		 * fault.  If a page is merged into a transparent huge
-		 * page, the individual subpages of that huge page
-		 * should be unmapped through MMU notifiers before we
-		 * get here.
-		 *
-		 * Merging of CompoundPages is not supported; they
-		 * should become splitting first, unmapped, merged,
-		 * and mapped back in on-demand.
-		 */
-		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-		pmd_clear(pmd);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pmd));
-	}
-
-	kvm_set_pmd(pmd, *new_pmd);
-	return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			       phys_addr_t addr, const pud_t *new_pudp)
-{
-	pud_t *pudp, old_pud;
-
-retry:
-	pudp = stage2_get_pud(kvm, cache, addr);
-	VM_BUG_ON(!pudp);
-
-	old_pud = *pudp;
-
-	/*
-	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
-	 * Skip updating the page tables if there is no change.
-	 */
-	if (pud_val(old_pud) == pud_val(*new_pudp))
-		return 0;
-
-	if (stage2_pud_present(kvm, old_pud)) {
-		/*
-		 * If we already have table level mapping for this block, unmap
-		 * the range for this block and retry.
-		 */
-		if (!stage2_pud_huge(kvm, old_pud)) {
-			unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
-			goto retry;
-		}
-
-		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
-		stage2_pud_clear(kvm, pudp);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pudp));
-	}
-
-	kvm_set_pud(pudp, *new_pudp);
-	return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
-				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	*pudpp = NULL;
-	*pmdpp = NULL;
-	*ptepp = NULL;
-
-	pudp = stage2_get_pud(kvm, NULL, addr);
-	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
-		return false;
-
-	if (stage2_pud_huge(kvm, *pudp)) {
-		*pudpp = pudp;
-		return true;
-	}
-
-	pmdp = stage2_pmd_offset(kvm, pudp, addr);
-	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
-		return false;
-
-	if (pmd_thp_or_huge(*pmdp)) {
-		*pmdpp = pmdp;
-		return true;
-	}
-
-	ptep = pte_offset_kernel(pmdp, addr);
-	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
-		return false;
-
-	*ptepp = ptep;
-	return true;
-}
-
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
-{
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	bool found;
-
-	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
-	if (!found)
-		return false;
-
-	if (pudp)
-		return kvm_s2pud_exec(pudp);
-	else if (pmdp)
-		return kvm_s2pmd_exec(pmdp);
-	else
-		return kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte,
-			  unsigned long flags)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte, old_pte;
-	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-	VM_BUG_ON(logging_active && !cache);
-
-	/* Create stage-2 page table mapping - Levels 0 and 1 */
-	pud = stage2_get_pud(kvm, cache, addr);
-	if (!pud) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PUD, then continue
-	 * on to allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pud(kvm, addr, pud);
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pmd = mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	if (!pmd) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PMD, then continue on to
-	 * allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pmd(kvm, addr, pmd);
-
-	/* Create stage-2 page mappings - Level 2 */
-	if (pmd_none(*pmd)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pte = mmu_memory_cache_alloc(cache);
-		kvm_pmd_populate(pmd, pte);
-		get_page(virt_to_page(pmd));
-	}
-
-	pte = pte_offset_kernel(pmd, addr);
-
-	if (iomap && pte_present(*pte))
-		return -EFAULT;
-
-	/* Create 2nd stage page table mapping - Level 3 */
-	old_pte = *pte;
-	if (pte_present(old_pte)) {
-		/* Skip page table update if there is no change */
-		if (pte_val(old_pte) == pte_val(*new_pte))
-			return 0;
-
-		kvm_set_pte(pte, __pte(0));
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pte));
-	}
-
-	kvm_set_pte(pte, *new_pte);
-	return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);
-		return 1;
-	}
-	return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pud);
-}
-
-/**
- * kvm_phys_addr_ioremap - map a device range to guest IPA
- *
- * @kvm:	The KVM pointer
- * @guest_ipa:	The IPA at which to insert the mapping
- * @pa:		The physical address of the device
- * @size:	The size of the mapping
- */
-int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
-			  phys_addr_t pa, unsigned long size, bool writable)
-{
-	phys_addr_t addr, end;
-	int ret = 0;
-	unsigned long pfn;
-	struct kvm_mmu_memory_cache cache = { 0, };
-
-	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-	pfn = __phys_to_pfn(pa);
-
-	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
-		if (writable)
-			pte = kvm_s2pte_mkwrite(pte);
-
-		ret = mmu_topup_memory_cache(&cache,
-					     kvm_mmu_cache_min_pages(kvm),
-					     KVM_NR_MEM_OBJS);
-		if (ret)
-			goto out;
-		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte,
-						KVM_S2PTE_FLAG_IS_IOMAP);
-		spin_unlock(&kvm->mmu_lock);
-		if (ret)
-			goto out;
-
-		pfn++;
-	}
-
-out:
-	mmu_free_memory_cache(&cache);
-	return ret;
-}
-
-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
-{
-	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *ipap >> PAGE_SHIFT;
-
-	if (kvm_is_transparent_hugepage(pfn)) {
-		unsigned long mask;
-		/*
-		 * The address we faulted on is backed by a transparent huge
-		 * page.  However, because we map the compound huge page and
-		 * not the individual tail page, we need to transfer the
-		 * refcount to the head page.  We have to be careful that the
-		 * THP doesn't start to split while we are adjusting the
-		 * refcounts.
-		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
-		 * was successful and we are holding the mmu_lock, so if this
-		 * THP is trying to split, it will be blocked in the mmu
-		 * notifier before touching any of the pages, specifically
-		 * before being able to call __split_huge_page_refcount().
-		 *
-		 * We can therefore safely transfer the refcount from PG_tail
-		 * to PG_head and switch the pfn from a tail page to the head
-		 * page accordingly.
-		 */
-		mask = PTRS_PER_PMD - 1;
-		VM_BUG_ON((gfn & mask) != (pfn & mask));
-		if (pfn & mask) {
-			*ipap &= PMD_MASK;
-			kvm_release_pfn_clean(pfn);
-			pfn &= ~mask;
-			kvm_get_pfn(pfn);
-			*pfnp = pfn;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:	pointer to pmd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			if (!kvm_s2pte_readonly(pte))
-				kvm_set_s2pte_readonly(pte);
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm:		kvm instance for the VM
- * @pud:	pointer to pud entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				if (!kvm_s2pmd_readonly(pmd))
-					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect PGD range
- * @pgd:	pointer to pgd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				if (!kvm_s2pud_readonly(pud))
-					kvm_set_s2pud_readonly(pud);
-			} else {
-				stage2_wp_pmds(kvm, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_range() - write protect stage2 memory region range
- * @kvm:	The KVM pointer
- * @addr:	Start address of range
- * @end:	End address of range
- */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-	pgd_t *pgd;
-	phys_addr_t next;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Release kvm_mmu_lock periodically if the memory region is
-		 * large. Otherwise, we may see kernel panics with
-		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
-		 * will also starve other vCPUs. We have to also make sure
-		 * that the page tables are not freed while we released
-		 * the lock.
-		 */
-		cond_resched_lock(&kvm->mmu_lock);
-		if (!READ_ONCE(kvm->arch.pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_puds(kvm, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
- * @kvm:	The KVM pointer
- * @slot:	The memory slot to write protect
- *
- * Called to start logging dirty pages after memory region
- * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PUD, PMD and PTEs are write protected in the memory region.
- * Afterwards read of dirty page log can be called.
- *
- * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
- * serializing operations for VM memory regions.
- */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
-{
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-	phys_addr_t start, end;
-
-	if (WARN_ON_ONCE(!memslot))
-		return;
-
-	start = memslot->base_gfn << PAGE_SHIFT;
-	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
-	spin_unlock(&kvm->mmu_lock);
-	kvm_flush_remote_tlbs(kvm);
-}
-
-/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
- * @kvm:	The KVM pointer
- * @slot:	The memory slot associated with mask
- * @gfn_offset:	The gfn offset in memory slot
- * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
- *		slot to be write protected
- *
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
- */
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
-	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
-	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
-
-	stage2_wp_range(kvm, start, end);
-}
-
-/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- */
-void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
-}
-
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__invalidate_icache_guest_page(pfn, size);
-}
-
-static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
-{
-	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
-}
-
-static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
-					       unsigned long hva,
-					       unsigned long map_size)
-{
-	gpa_t gpa_start;
-	hva_t uaddr_start, uaddr_end;
-	size_t size;
-
-	size = memslot->npages * PAGE_SIZE;
-
-	gpa_start = memslot->base_gfn << PAGE_SHIFT;
-
-	uaddr_start = memslot->userspace_addr;
-	uaddr_end = uaddr_start + size;
-
-	/*
-	 * Pages belonging to memslots that don't have the same alignment
-	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
-	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
-	 *
-	 * Consider a layout like the following:
-	 *
-	 *    memslot->userspace_addr:
-	 *    +-----+--------------------+--------------------+---+
-	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
-	 *    +-----+--------------------+--------------------+---+
-	 *
-	 *    memslot->base_gfn << PAGE_SIZE:
-	 *      +---+--------------------+--------------------+-----+
-	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
-	 *      +---+--------------------+--------------------+-----+
-	 *
-	 * If we create those stage-2 blocks, we'll end up with this incorrect
-	 * mapping:
-	 *   d -> f
-	 *   e -> g
-	 *   f -> h
-	 */
-	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
-		return false;
-
-	/*
-	 * Next, let's make sure we're not trying to map anything not covered
-	 * by the memslot. This means we have to prohibit block size mappings
-	 * for the beginning and end of a non-block aligned and non-block sized
-	 * memory slot (illustrated by the head and tail parts of the
-	 * userspace view above containing pages 'abcde' and 'xyz',
-	 * respectively).
-	 *
-	 * Note that it doesn't matter if we do the check using the
-	 * userspace_addr or the base_gfn, as both are equally aligned (per
-	 * the check above) and equally sized.
-	 */
-	return (hva & ~(map_size - 1)) >= uaddr_start &&
-	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
-}
-
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  struct kvm_memory_slot *memslot, unsigned long hva,
-			  unsigned long fault_status)
-{
-	int ret;
-	bool write_fault, writable, force_pte = false;
-	bool exec_fault, needs_exec;
-	unsigned long mmu_seq;
-	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
-	struct vm_area_struct *vma;
-	short vma_shift;
-	kvm_pfn_t pfn;
-	pgprot_t mem_type = PAGE_S2;
-	bool logging_active = memslot_is_logging(memslot);
-	unsigned long vma_pagesize, flags = 0;
-
-	write_fault = kvm_is_write_fault(vcpu);
-	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
-	VM_BUG_ON(write_fault && exec_fault);
-
-	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
-		kvm_err("Unexpected L2 read permission error\n");
-		return -EFAULT;
-	}
-
-	/* Let's check if we will get back a huge page backed by hugetlbfs */
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma_intersection(current->mm, hva, hva + 1);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		up_read(&current->mm->mmap_sem);
-		return -EFAULT;
-	}
-
-	if (is_vm_hugetlb_page(vma))
-		vma_shift = huge_page_shift(hstate_vma(vma));
-	else
-		vma_shift = PAGE_SHIFT;
-
-	vma_pagesize = 1ULL << vma_shift;
-	if (logging_active ||
-	    (vma->vm_flags & VM_PFNMAP) ||
-	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
-		force_pte = true;
-		vma_pagesize = PAGE_SIZE;
-	}
-
-	/*
-	 * The stage2 has a minimum of 2 level table (For arm64 see
-	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
-	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
-	 * As for PUD huge maps, we must make sure that we have at least
-	 * 3 levels, i.e, PMD is not folded.
-	 */
-	if (vma_pagesize == PMD_SIZE ||
-	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
-		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	up_read(&current->mm->mmap_sem);
-
-	/* We need minimum second+third level pages */
-	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
-				     KVM_NR_MEM_OBJS);
-	if (ret)
-		return ret;
-
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/*
-	 * Ensure the read of mmu_notifier_seq happens before we call
-	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-	 * the page we just got a reference to gets unmapped before we have a
-	 * chance to grab the mmu_lock, which ensure that if the page gets
-	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
-	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
-	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
-	 */
-	smp_rmb();
-
-	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
-	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(hva, vma_shift);
-		return 0;
-	}
-	if (is_error_noslot_pfn(pfn))
-		return -EFAULT;
-
-	if (kvm_is_device_pfn(pfn)) {
-		mem_type = PAGE_S2_DEVICE;
-		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-	} else if (logging_active) {
-		/*
-		 * Faults on pages in a memslot with logging enabled
-		 * should not be mapped with huge pages (it introduces churn
-		 * and performance degradation), so force a pte mapping.
-		 */
-		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
-		/*
-		 * Only actually map the page as writable if this was a write
-		 * fault.
-		 */
-		if (!write_fault)
-			writable = false;
-	}
-
-	if (exec_fault && is_iomap(flags))
-		return -ENOEXEC;
-
-	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(kvm, mmu_seq))
-		goto out_unlock;
-
-	if (vma_pagesize == PAGE_SIZE && !force_pte) {
-		/*
-		 * Only PMD_SIZE transparent hugepages(THP) are
-		 * currently supported. This code will need to be
-		 * updated to support other THP sizes.
-		 *
-		 * Make sure the host VA and the guest IPA are sufficiently
-		 * aligned and that the block is contained within the memslot.
-		 */
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-		    transparent_hugepage_adjust(&pfn, &fault_ipa))
-			vma_pagesize = PMD_SIZE;
-	}
-
-	if (writable)
-		kvm_set_pfn_dirty(pfn);
-
-	if (fault_status != FSC_PERM && !is_iomap(flags))
-		clean_dcache_guest_page(pfn, vma_pagesize);
-
-	if (exec_fault)
-		invalidate_icache_guest_page(pfn, vma_pagesize);
-
-	/*
-	 * If we took an execution fault we have made the
-	 * icache/dcache coherent above and should now let the s2
-	 * mapping be executable.
-	 *
-	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
-	 * execute permissions, and we preserve whatever we have.
-	 */
-	needs_exec = exec_fault ||
-		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
-
-	if (vma_pagesize == PUD_SIZE) {
-		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
-		new_pud = kvm_pud_mkhuge(new_pud);
-		if (writable)
-			new_pud = kvm_s2pud_mkwrite(new_pud);
-
-		if (needs_exec)
-			new_pud = kvm_s2pud_mkexec(new_pud);
-
-		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
-	} else if (vma_pagesize == PMD_SIZE) {
-		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
-		new_pmd = kvm_pmd_mkhuge(new_pmd);
-
-		if (writable)
-			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
-		if (needs_exec)
-			new_pmd = kvm_s2pmd_mkexec(new_pmd);
-
-		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
-	} else {
-		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
-		if (writable) {
-			new_pte = kvm_s2pte_mkwrite(new_pte);
-			mark_page_dirty(kvm, gfn);
-		}
-
-		if (needs_exec)
-			new_pte = kvm_s2pte_mkexec(new_pte);
-
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
-	}
-
-out_unlock:
-	spin_unlock(&kvm->mmu_lock);
-	kvm_set_pfn_accessed(pfn);
-	kvm_release_pfn_clean(pfn);
-	return ret;
-}
-
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
-static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	kvm_pfn_t pfn;
-	bool pfn_valid = false;
-
-	trace_kvm_access_fault(fault_ipa);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-
-	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
-		goto out;
-
-	if (pud) {		/* HugeTLB */
-		*pud = kvm_s2pud_mkyoung(*pud);
-		pfn = kvm_pud_pfn(*pud);
-		pfn_valid = true;
-	} else	if (pmd) {	/* THP, HugeTLB */
-		*pmd = pmd_mkyoung(*pmd);
-		pfn = pmd_pfn(*pmd);
-		pfn_valid = true;
-	} else {
-		*pte = pte_mkyoung(*pte);	/* Just a page... */
-		pfn = pte_pfn(*pte);
-		pfn_valid = true;
-	}
-
-out:
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (pfn_valid)
-		kvm_set_pfn_accessed(pfn);
-}
-
-/**
- * kvm_handle_guest_abort - handles all 2nd stage aborts
- * @vcpu:	the VCPU pointer
- * @run:	the kvm_run structure
- *
- * Any abort that gets to the host is almost guaranteed to be caused by a
- * missing second stage translation table entry, which can mean that either the
- * guest simply needs more memory and we must allocate an appropriate page or it
- * can mean that the guest tried to access I/O memory, which is emulated by user
- * space. The distinction is based on the IPA causing the fault and whether this
- * memory region has been registered as standard RAM by user space.
- */
-int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long fault_status;
-	phys_addr_t fault_ipa;
-	struct kvm_memory_slot *memslot;
-	unsigned long hva;
-	bool is_iabt, write_fault, writable;
-	gfn_t gfn;
-	int ret, idx;
-
-	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
-
-	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
-
-	/* Synchronous External Abort? */
-	if (kvm_vcpu_dabt_isextabt(vcpu)) {
-		/*
-		 * For RAS the host kernel may handle this abort.
-		 * There is no need to pass the error into the guest.
-		 */
-		if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
-			return 1;
-
-		if (unlikely(!is_iabt)) {
-			kvm_inject_vabt(vcpu);
-			return 1;
-		}
-	}
-
-	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
-			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
-
-	/* Check the stage-2 fault is trans. fault or write fault */
-	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
-	    fault_status != FSC_ACCESS) {
-		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
-			kvm_vcpu_trap_get_class(vcpu),
-			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
-			(unsigned long)kvm_vcpu_get_hsr(vcpu));
-		return -EFAULT;
-	}
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-	gfn = fault_ipa >> PAGE_SHIFT;
-	memslot = gfn_to_memslot(vcpu->kvm, gfn);
-	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
-	write_fault = kvm_is_write_fault(vcpu);
-	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
-		if (is_iabt) {
-			/* Prefetch Abort on I/O address */
-			ret = -ENOEXEC;
-			goto out;
-		}
-
-		/*
-		 * Check for a cache maintenance operation. Since we
-		 * ended-up here, we know it is outside of any memory
-		 * slot. But we can't find out if that is for a device,
-		 * or if the guest is just being stupid. The only thing
-		 * we know for sure is that this range cannot be cached.
-		 *
-		 * So let's assume that the guest is just being
-		 * cautious, and skip the instruction.
-		 */
-		if (kvm_vcpu_dabt_is_cm(vcpu)) {
-			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-			ret = 1;
-			goto out_unlock;
-		}
-
-		/*
-		 * The IPA is reported as [MAX:12], so we need to
-		 * complement it with the bottom 12 bits from the
-		 * faulting VA. This is always 12 bits, irrespective
-		 * of the page size.
-		 */
-		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-		ret = io_mem_abort(vcpu, run, fault_ipa);
-		goto out_unlock;
-	}
-
-	/* Userspace should not be able to register out-of-bounds IPAs */
-	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
-
-	if (fault_status == FSC_ACCESS) {
-		handle_access_fault(vcpu, fault_ipa);
-		ret = 1;
-		goto out_unlock;
-	}
-
-	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
-	if (ret == 0)
-		ret = 1;
-out:
-	if (ret == -ENOEXEC) {
-		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		ret = 1;
-	}
-out_unlock:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
-}
-
-static int handle_hva_to_gpa(struct kvm *kvm,
-			     unsigned long start,
-			     unsigned long end,
-			     int (*handler)(struct kvm *kvm,
-					    gpa_t gpa, u64 size,
-					    void *data),
-			     void *data)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int ret = 0;
-
-	slots = kvm_memslots(kvm);
-
-	/* we only care about the pages that the guest sees */
-	kvm_for_each_memslot(memslot, slots) {
-		unsigned long hva_start, hva_end;
-		gfn_t gpa;
-
-		hva_start = max(start, memslot->userspace_addr);
-		hva_end = min(end, memslot->userspace_addr +
-					(memslot->npages << PAGE_SHIFT));
-		if (hva_start >= hva_end)
-			continue;
-
-		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-	}
-
-	return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	unmap_stage2_range(kvm, gpa, size);
-	return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_unmap_hva_range(start, end);
-	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-	return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pte_t *pte = (pte_t *)data;
-
-	WARN_ON(size != PAGE_SIZE);
-	/*
-	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-	 * flag clear because MMU notifiers will have unmapped a huge PMD before
-	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-	 * therefore stage2_set_pte() never needs to clear out a huge PMD
-	 * through this calling path.
-	 */
-	stage2_set_pte(kvm, NULL, gpa, pte, 0);
-	return 0;
-}
-
-
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-	unsigned long end = hva + PAGE_SIZE;
-	kvm_pfn_t pfn = pte_pfn(pte);
-	pte_t stage2_pte;
-
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_set_spte_hva(hva);
-
-	/*
-	 * We've moved a page around, probably through CoW, so let's treat it
-	 * just like a translation fault and clean the cache to the PoC.
-	 */
-	clean_dcache_guest_page(pfn, PAGE_SIZE);
-	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
-	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
-	return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return stage2_pudp_test_and_clear_young(pud);
-	else if (pmd)
-		return stage2_pmdp_test_and_clear_young(pmd);
-	else
-		return stage2_ptep_test_and_clear_young(pte);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return kvm_s2pud_young(*pud);
-	else if (pmd)
-		return pmd_young(*pmd);
-	else
-		return pte_young(*pte);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-	trace_kvm_age_hva(start, end);
-	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-	trace_kvm_test_age_hva(hva);
-	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
-				 kvm_test_age_hva_handler, NULL);
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
-phys_addr_t kvm_mmu_get_httbr(void)
-{
-	if (__kvm_cpu_uses_extended_idmap())
-		return virt_to_phys(merged_hyp_pgd);
-	else
-		return virt_to_phys(hyp_pgd);
-}
-
-phys_addr_t kvm_get_idmap_vector(void)
-{
-	return hyp_idmap_vector;
-}
-
-static int kvm_map_idmap_text(pgd_t *pgd)
-{
-	int err;
-
-	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				      hyp_idmap_start, hyp_idmap_end,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
-	if (err)
-		kvm_err("Failed to idmap %lx-%lx\n",
-			hyp_idmap_start, hyp_idmap_end);
-
-	return err;
-}
-
-int kvm_mmu_init(void)
-{
-	int err;
-
-	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
-	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
-	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
-	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
-	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
-
-	/*
-	 * We rely on the linker script to ensure at build time that the HYP
-	 * init code does not cross a page boundary.
-	 */
-	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
-
-	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
-	kvm_debug("HYP VA range: %lx:%lx\n",
-		  kern_hyp_va(PAGE_OFFSET),
-		  kern_hyp_va((unsigned long)high_memory - 1));
-
-	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
-	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
-	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
-		/*
-		 * The idmap page is intersecting with the VA space,
-		 * it is not safe to continue further.
-		 */
-		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
-		err = -EINVAL;
-		goto out;
-	}
-
-	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-	if (!hyp_pgd) {
-		kvm_err("Hyp mode PGD not allocated\n");
-		err = -ENOMEM;
-		goto out;
-	}
-
-	if (__kvm_cpu_uses_extended_idmap()) {
-		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-							 hyp_pgd_order);
-		if (!boot_hyp_pgd) {
-			kvm_err("Hyp boot PGD not allocated\n");
-			err = -ENOMEM;
-			goto out;
-		}
-
-		err = kvm_map_idmap_text(boot_hyp_pgd);
-		if (err)
-			goto out;
-
-		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		if (!merged_hyp_pgd) {
-			kvm_err("Failed to allocate extra HYP pgd\n");
-			goto out;
-		}
-		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-				    hyp_idmap_start);
-	} else {
-		err = kvm_map_idmap_text(hyp_pgd);
-		if (err)
-			goto out;
-	}
-
-	io_map_base = hyp_idmap_start;
-	return 0;
-out:
-	free_hyp_pgds();
-	return err;
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   const struct kvm_userspace_memory_region *mem,
-				   struct kvm_memory_slot *old,
-				   const struct kvm_memory_slot *new,
-				   enum kvm_mr_change change)
-{
-	/*
-	 * At this point memslot has been committed and there is an
-	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
-	 * memory slot is write protected.
-	 */
-	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
-		kvm_mmu_wp_memory_region(kvm, mem->slot);
-}
-
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
-				   struct kvm_memory_slot *memslot,
-				   const struct kvm_userspace_memory_region *mem,
-				   enum kvm_mr_change change)
-{
-	hva_t hva = mem->userspace_addr;
-	hva_t reg_end = hva + mem->memory_size;
-	bool writable = !(mem->flags & KVM_MEM_READONLY);
-	int ret = 0;
-
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
-			change != KVM_MR_FLAGS_ONLY)
-		return 0;
-
-	/*
-	 * Prevent userspace from creating a memory region outside of the IPA
-	 * space addressable by the KVM guest IPA space.
-	 */
-	if (memslot->base_gfn + memslot->npages >=
-	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
-		return -EFAULT;
-
-	down_read(&current->mm->mmap_sem);
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we can map
-	 * any of them right now.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (vma->vm_flags & VM_PFNMAP) {
-			gpa_t gpa = mem->guest_phys_addr +
-				    (vm_start - mem->userspace_addr);
-			phys_addr_t pa;
-
-			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
-			pa += vm_start - vma->vm_start;
-
-			/* IO region dirty page logging not allowed */
-			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
-						    vm_end - vm_start,
-						    writable);
-			if (ret)
-				break;
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-
-	if (change == KVM_MR_FLAGS_ONLY)
-		goto out;
-
-	spin_lock(&kvm->mmu_lock);
-	if (ret)
-		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-	else
-		stage2_flush_memslot(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
-out:
-	up_read(&current->mm->mmap_sem);
-	return ret;
-}
-
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-}
-
-void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
-{
-}
-
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
-	kvm_free_stage2_pgd(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-				   struct kvm_memory_slot *slot)
-{
-	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = slot->npages << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	unmap_stage2_range(kvm, gpa, size);
-	spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
- *
- * Main problems:
- * - S/W ops are local to a CPU (not broadcast)
- * - We have line migration behind our back (speculation)
- * - System caches don't support S/W at all (damn!)
- *
- * In the face of the above, the best we can do is to try and convert
- * S/W ops to VA ops. Because the guest is not allowed to infer the
- * S/W to PA mapping, it can only use S/W to nuke the whole cache,
- * which is a rather good thing for us.
- *
- * Also, it is only used when turning caches on/off ("The expected
- * usage of the cache maintenance instructions that operate by set/way
- * is associated with the cache maintenance instructions associated
- * with the powerdown and powerup of caches, if this is required by
- * the implementation.").
- *
- * We use the following policy:
- *
- * - If we trap a S/W operation, we enable VM trapping to detect
- *   caches being turned on/off, and do a full clean.
- *
- * - We flush the caches on both caches being turned on and off.
- *
- * - Once the caches are enabled, we stop trapping VM ops.
- */
-void kvm_set_way_flush(struct kvm_vcpu *vcpu)
-{
-	unsigned long hcr = *vcpu_hcr(vcpu);
-
-	/*
-	 * If this is the first time we do a S/W operation
-	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
-	 * VM trapping.
-	 *
-	 * Otherwise, rely on the VM trapping to wait for the MMU +
-	 * Caches to be turned off. At that point, we'll be able to
-	 * clean the caches again.
-	 */
-	if (!(hcr & HCR_TVM)) {
-		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
-					vcpu_has_cache_enabled(vcpu));
-		stage2_flush_vm(vcpu->kvm);
-		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
-	}
-}
-
-void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
-{
-	bool now_enabled = vcpu_has_cache_enabled(vcpu);
-
-	/*
-	 * If switching the MMU+caches on, need to invalidate the caches.
-	 * If switching it off, need to clean the caches.
-	 * Clean + invalidate does the trick always.
-	 */
-	if (now_enabled != was_enabled)
-		stage2_flush_vm(vcpu->kvm);
-
-	/* Caches are now on, stop trapping VM ops (until a S/W op) */
-	if (now_enabled)
-		*vcpu_hcr(vcpu) &= ~HCR_TVM;
-
-	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
-}
diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c
deleted file mode 100644
index d45b8b9a4415..000000000000
--- a/virt/kvm/arm/perf.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Based on the x86 implementation.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/perf_event.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-static int kvm_is_in_guest(void)
-{
-        return kvm_get_running_vcpu() != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_get_running_vcpu();
-
-	if (vcpu)
-		return !vcpu_mode_priv(vcpu);
-
-	return 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_get_running_vcpu();
-
-	if (vcpu)
-		return *vcpu_pc(vcpu);
-
-	return 0;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
-	.is_in_guest	= kvm_is_in_guest,
-	.is_user_mode	= kvm_is_user_mode,
-	.get_guest_ip	= kvm_get_guest_ip,
-};
-
-int kvm_perf_init(void)
-{
-	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
-}
-
-int kvm_perf_teardown(void)
-{
-	return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-}
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
deleted file mode 100644
index f0d0312c0a55..000000000000
--- a/virt/kvm/arm/pmu.c
+++ /dev/null
@@ -1,869 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Shannon Zhao <shannon.zhao@linaro.org>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/perf_event.h>
-#include <linux/perf/arm_pmu.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_emulate.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_vgic.h>
-
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
-
-#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
-
-/**
- * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	return (select_idx == ARMV8_PMU_CYCLE_IDX &&
-		__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
-}
-
-static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
-{
-	struct kvm_pmu *pmu;
-	struct kvm_vcpu_arch *vcpu_arch;
-
-	pmc -= pmc->idx;
-	pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
-	vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
-	return container_of(vcpu_arch, struct kvm_vcpu, arch);
-}
-
-/**
- * kvm_pmu_pmc_is_chained - determine if the pmc is chained
- * @pmc: The PMU counter pointer
- */
-static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
-{
-	struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-
-	return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-}
-
-/**
- * kvm_pmu_idx_is_high_counter - determine if select_idx is a high/low counter
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
-{
-	return select_idx & 0x1;
-}
-
-/**
- * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
- * @pmc: The PMU counter pointer
- *
- * When a pair of PMCs are chained together we use the low counter (canonical)
- * to hold the underlying perf event.
- */
-static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
-{
-	if (kvm_pmu_pmc_is_chained(pmc) &&
-	    kvm_pmu_idx_is_high_counter(pmc->idx))
-		return pmc - 1;
-
-	return pmc;
-}
-static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
-{
-	if (kvm_pmu_idx_is_high_counter(pmc->idx))
-		return pmc - 1;
-	else
-		return pmc + 1;
-}
-
-/**
- * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	u64 eventsel, reg;
-
-	select_idx |= 0x1;
-
-	if (select_idx == ARMV8_PMU_CYCLE_IDX)
-		return false;
-
-	reg = PMEVTYPER0_EL0 + select_idx;
-	eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
-
-	return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
-}
-
-/**
- * kvm_pmu_get_pair_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @pmc: The PMU counter pointer
- */
-static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
-					  struct kvm_pmc *pmc)
-{
-	u64 counter, counter_high, reg, enabled, running;
-
-	if (kvm_pmu_pmc_is_chained(pmc)) {
-		pmc = kvm_pmu_get_canonical_pmc(pmc);
-		reg = PMEVCNTR0_EL0 + pmc->idx;
-
-		counter = __vcpu_sys_reg(vcpu, reg);
-		counter_high = __vcpu_sys_reg(vcpu, reg + 1);
-
-		counter = lower_32_bits(counter) | (counter_high << 32);
-	} else {
-		reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
-		      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
-		counter = __vcpu_sys_reg(vcpu, reg);
-	}
-
-	/*
-	 * The real counter value is equal to the value of counter register plus
-	 * the value perf event counts.
-	 */
-	if (pmc->perf_event)
-		counter += perf_event_read_value(pmc->perf_event, &enabled,
-						 &running);
-
-	return counter;
-}
-
-/**
- * kvm_pmu_get_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	u64 counter;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
-
-	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-	if (kvm_pmu_pmc_is_chained(pmc) &&
-	    kvm_pmu_idx_is_high_counter(select_idx))
-		counter = upper_32_bits(counter);
-	else if (select_idx != ARMV8_PMU_CYCLE_IDX)
-		counter = lower_32_bits(counter);
-
-	return counter;
-}
-
-/**
- * kvm_pmu_set_counter_value - set PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- * @val: The counter value
- */
-void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
-{
-	u64 reg;
-
-	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-	      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
-	__vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
-
-	/* Recreate the perf event to reflect the updated sample_period */
-	kvm_pmu_create_perf_event(vcpu, select_idx);
-}
-
-/**
- * kvm_pmu_release_perf_event - remove the perf event
- * @pmc: The PMU counter pointer
- */
-static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
-{
-	pmc = kvm_pmu_get_canonical_pmc(pmc);
-	if (pmc->perf_event) {
-		perf_event_disable(pmc->perf_event);
-		perf_event_release_kernel(pmc->perf_event);
-		pmc->perf_event = NULL;
-	}
-}
-
-/**
- * kvm_pmu_stop_counter - stop PMU counter
- * @pmc: The PMU counter pointer
- *
- * If this counter has been configured to monitor some event, release it here.
- */
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
-{
-	u64 counter, reg, val;
-
-	pmc = kvm_pmu_get_canonical_pmc(pmc);
-	if (!pmc->perf_event)
-		return;
-
-	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-	if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
-		reg = PMCCNTR_EL0;
-		val = counter;
-	} else {
-		reg = PMEVCNTR0_EL0 + pmc->idx;
-		val = lower_32_bits(counter);
-	}
-
-	__vcpu_sys_reg(vcpu, reg) = val;
-
-	if (kvm_pmu_pmc_is_chained(pmc))
-		__vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
-
-	kvm_pmu_release_perf_event(pmc);
-}
-
-/**
- * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
-		pmu->pmc[i].idx = i;
-}
-
-/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	int i;
-
-	for_each_set_bit(i, &mask, 32)
-		kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
-
-	bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
-}
-
-/**
- * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
-		kvm_pmu_release_perf_event(&pmu->pmc[i]);
-}
-
-u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
-{
-	u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
-
-	val &= ARMV8_PMU_PMCR_N_MASK;
-	if (val == 0)
-		return BIT(ARMV8_PMU_CYCLE_IDX);
-	else
-		return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
-}
-
-/**
- * kvm_pmu_enable_counter_mask - enable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENSET register
- *
- * Call perf_event_enable to start counting the perf event
- */
-void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-
-	if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
-		return;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		if (!(val & BIT(i)))
-			continue;
-
-		pmc = &pmu->pmc[i];
-
-		/* A change in the enable state may affect the chain state */
-		kvm_pmu_update_pmc_chained(vcpu, i);
-		kvm_pmu_create_perf_event(vcpu, i);
-
-		/* At this point, pmc must be the canonical */
-		if (pmc->perf_event) {
-			perf_event_enable(pmc->perf_event);
-			if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
-				kvm_debug("fail to enable perf event\n");
-		}
-	}
-}
-
-/**
- * kvm_pmu_disable_counter_mask - disable selected PMU counters
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENCLR register
- *
- * Call perf_event_disable to stop counting the perf event
- */
-void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-
-	if (!val)
-		return;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		if (!(val & BIT(i)))
-			continue;
-
-		pmc = &pmu->pmc[i];
-
-		/* A change in the enable state may affect the chain state */
-		kvm_pmu_update_pmc_chained(vcpu, i);
-		kvm_pmu_create_perf_event(vcpu, i);
-
-		/* At this point, pmc must be the canonical */
-		if (pmc->perf_event)
-			perf_event_disable(pmc->perf_event);
-	}
-}
-
-static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
-{
-	u64 reg = 0;
-
-	if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
-		reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
-		reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-		reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
-		reg &= kvm_pmu_valid_counter_mask(vcpu);
-	}
-
-	return reg;
-}
-
-static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	bool overflow;
-
-	if (!kvm_arm_pmu_v3_ready(vcpu))
-		return;
-
-	overflow = !!kvm_pmu_overflow_status(vcpu);
-	if (pmu->irq_level == overflow)
-		return;
-
-	pmu->irq_level = overflow;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm))) {
-		int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					      pmu->irq_num, overflow, pmu);
-		WARN_ON(ret);
-	}
-}
-
-bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-	bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm)))
-		return false;
-
-	return pmu->irq_level != run_level;
-}
-
-/*
- * Reflect the PMU overflow interrupt output level into the kvm_run structure
- */
-void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
-{
-	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-	/* Populate the timer bitmap for user space */
-	regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
-	if (vcpu->arch.pmu.irq_level)
-		regs->device_irq_level |= KVM_ARM_DEV_PMU;
-}
-
-/**
- * kvm_pmu_flush_hwstate - flush pmu state to cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the host, and inject
- * an interrupt if that was the case.
- */
-void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	kvm_pmu_update_state(vcpu);
-}
-
-/**
- * kvm_pmu_sync_hwstate - sync pmu state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the guest, and
- * inject an interrupt if that was the case.
- */
-void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	kvm_pmu_update_state(vcpu);
-}
-
-/**
- * When the perf event overflows, set the overflow status and inform the vcpu.
- */
-static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
-				  struct perf_sample_data *data,
-				  struct pt_regs *regs)
-{
-	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-	struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
-	struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-	int idx = pmc->idx;
-	u64 period;
-
-	cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
-
-	/*
-	 * Reset the sample period to the architectural limit,
-	 * i.e. the point where the counter overflows.
-	 */
-	period = -(local64_read(&perf_event->count));
-
-	if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
-		period &= GENMASK(31, 0);
-
-	local64_set(&perf_event->hw.period_left, 0);
-	perf_event->attr.sample_period = period;
-	perf_event->hw.sample_period = period;
-
-	__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
-
-	if (kvm_pmu_overflow_status(vcpu)) {
-		kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-		kvm_vcpu_kick(vcpu);
-	}
-
-	cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
-}
-
-/**
- * kvm_pmu_software_increment - do software increment
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMSWINC register
- */
-void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	int i;
-
-	if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
-		return;
-
-	/* Weed out disabled counters */
-	val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-
-	for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
-		u64 type, reg;
-
-		if (!(val & BIT(i)))
-			continue;
-
-		/* PMSWINC only applies to ... SW_INC! */
-		type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
-		type &= ARMV8_PMU_EVTYPE_EVENT;
-		if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
-			continue;
-
-		/* increment this even SW_INC counter */
-		reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
-		reg = lower_32_bits(reg);
-		__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
-
-		if (reg) /* no overflow on the low part */
-			continue;
-
-		if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
-			/* increment the high counter */
-			reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
-			reg = lower_32_bits(reg);
-			__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
-			if (!reg) /* mark overflow on the high counter */
-				__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
-		} else {
-			/* mark overflow on low counter */
-			__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
-		}
-	}
-}
-
-/**
- * kvm_pmu_handle_pmcr - handle PMCR register
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCR register
- */
-void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
-{
-	unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
-	int i;
-
-	if (val & ARMV8_PMU_PMCR_E) {
-		kvm_pmu_enable_counter_mask(vcpu,
-		       __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
-	} else {
-		kvm_pmu_disable_counter_mask(vcpu, mask);
-	}
-
-	if (val & ARMV8_PMU_PMCR_C)
-		kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
-
-	if (val & ARMV8_PMU_PMCR_P) {
-		for_each_set_bit(i, &mask, 32)
-			kvm_pmu_set_counter_value(vcpu, i, 0);
-	}
-}
-
-static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
-	       (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
-}
-
-/**
- * kvm_pmu_create_perf_event - create a perf event for a counter
- * @vcpu: The vcpu pointer
- * @select_idx: The number of selected counter
- */
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-	struct perf_event *event;
-	struct perf_event_attr attr;
-	u64 eventsel, counter, reg, data;
-
-	/*
-	 * For chained counters the event type and filtering attributes are
-	 * obtained from the low/even counter. We also use this counter to
-	 * determine if the event is enabled/disabled.
-	 */
-	pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
-
-	reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
-	      ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
-	data = __vcpu_sys_reg(vcpu, reg);
-
-	kvm_pmu_stop_counter(vcpu, pmc);
-	eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
-
-	/* Software increment event does't need to be backed by a perf event */
-	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
-	    pmc->idx != ARMV8_PMU_CYCLE_IDX)
-		return;
-
-	memset(&attr, 0, sizeof(struct perf_event_attr));
-	attr.type = PERF_TYPE_RAW;
-	attr.size = sizeof(attr);
-	attr.pinned = 1;
-	attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
-	attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
-	attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
-	attr.exclude_hv = 1; /* Don't count EL2 events */
-	attr.exclude_host = 1; /* Don't count host events */
-	attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
-		ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
-
-	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
-
-	if (kvm_pmu_pmc_is_chained(pmc)) {
-		/**
-		 * The initial sample period (overflow count) of an event. For
-		 * chained counters we only support overflow interrupts on the
-		 * high counter.
-		 */
-		attr.sample_period = (-counter) & GENMASK(63, 0);
-		attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
-
-		event = perf_event_create_kernel_counter(&attr, -1, current,
-							 kvm_pmu_perf_overflow,
-							 pmc + 1);
-	} else {
-		/* The initial sample period (overflow count) of an event. */
-		if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
-			attr.sample_period = (-counter) & GENMASK(63, 0);
-		else
-			attr.sample_period = (-counter) & GENMASK(31, 0);
-
-		event = perf_event_create_kernel_counter(&attr, -1, current,
-						 kvm_pmu_perf_overflow, pmc);
-	}
-
-	if (IS_ERR(event)) {
-		pr_err_once("kvm: pmu event creation failed %ld\n",
-			    PTR_ERR(event));
-		return;
-	}
-
-	pmc->perf_event = event;
-}
-
-/**
- * kvm_pmu_update_pmc_chained - update chained bitmap
- * @vcpu: The vcpu pointer
- * @select_idx: The number of selected counter
- *
- * Update the chained bitmap based on the event type written in the
- * typer register and the enable state of the odd register.
- */
-static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
-	bool new_state, old_state;
-
-	old_state = kvm_pmu_pmc_is_chained(pmc);
-	new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
-		    kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
-
-	if (old_state == new_state)
-		return;
-
-	canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
-	kvm_pmu_stop_counter(vcpu, canonical_pmc);
-	if (new_state) {
-		/*
-		 * During promotion from !chained to chained we must ensure
-		 * the adjacent counter is stopped and its event destroyed
-		 */
-		kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
-		set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-		return;
-	}
-	clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
-}
-
-/**
- * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
- * @vcpu: The vcpu pointer
- * @data: The data guest writes to PMXEVTYPER_EL0
- * @select_idx: The number of selected counter
- *
- * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an
- * event with given hardware event number. Here we call perf_event API to
- * emulate this action and create a kernel perf event for it.
- */
-void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
-				    u64 select_idx)
-{
-	u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
-
-	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-	      ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
-
-	__vcpu_sys_reg(vcpu, reg) = event_type;
-
-	kvm_pmu_update_pmc_chained(vcpu, select_idx);
-	kvm_pmu_create_perf_event(vcpu, select_idx);
-}
-
-bool kvm_arm_support_pmu_v3(void)
-{
-	/*
-	 * Check if HW_PERF_EVENTS are supported by checking the number of
-	 * hardware performance counters. This could ensure the presence of
-	 * a physical PMU and CONFIG_PERF_EVENT is selected.
-	 */
-	return (perf_num_counters() > 0);
-}
-
-int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.pmu.created)
-		return 0;
-
-	/*
-	 * A valid interrupt configuration for the PMU is either to have a
-	 * properly configured interrupt number and using an in-kernel
-	 * irqchip, or to not have an in-kernel GIC and not set an IRQ.
-	 */
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		int irq = vcpu->arch.pmu.irq_num;
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -EINVAL;
-
-		/*
-		 * If we are using an in-kernel vgic, at this point we know
-		 * the vgic will be initialized, so we can check the PMU irq
-		 * number against the dimensions of the vgic and make sure
-		 * it's valid.
-		 */
-		if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
-			return -EINVAL;
-	} else if (kvm_arm_pmu_irq_initialized(vcpu)) {
-		   return -EINVAL;
-	}
-
-	kvm_pmu_vcpu_reset(vcpu);
-	vcpu->arch.pmu.ready = true;
-
-	return 0;
-}
-
-static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
-{
-	if (!kvm_arm_support_pmu_v3())
-		return -ENODEV;
-
-	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-		return -ENXIO;
-
-	if (vcpu->arch.pmu.created)
-		return -EBUSY;
-
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		int ret;
-
-		/*
-		 * If using the PMU with an in-kernel virtual GIC
-		 * implementation, we require the GIC to be already
-		 * initialized when initializing the PMU.
-		 */
-		if (!vgic_initialized(vcpu->kvm))
-			return -ENODEV;
-
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -ENXIO;
-
-		ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
-					 &vcpu->arch.pmu);
-		if (ret)
-			return ret;
-	}
-
-	vcpu->arch.pmu.created = true;
-	return 0;
-}
-
-/*
- * For one VM the interrupt type must be same for each vcpu.
- * As a PPI, the interrupt number is the same for all vcpus,
- * while as an SPI it must be a separate number per vcpu.
- */
-static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			continue;
-
-		if (irq_is_ppi(irq)) {
-			if (vcpu->arch.pmu.irq_num != irq)
-				return false;
-		} else {
-			if (vcpu->arch.pmu.irq_num == irq)
-				return false;
-		}
-	}
-
-	return true;
-}
-
-int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ: {
-		int __user *uaddr = (int __user *)(long)attr->addr;
-		int irq;
-
-		if (!irqchip_in_kernel(vcpu->kvm))
-			return -EINVAL;
-
-		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return -ENODEV;
-
-		if (get_user(irq, uaddr))
-			return -EFAULT;
-
-		/* The PMU overflow interrupt can be a PPI or a valid SPI. */
-		if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
-			return -EINVAL;
-
-		if (!pmu_irq_is_valid(vcpu->kvm, irq))
-			return -EINVAL;
-
-		if (kvm_arm_pmu_irq_initialized(vcpu))
-			return -EBUSY;
-
-		kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
-		vcpu->arch.pmu.irq_num = irq;
-		return 0;
-	}
-	case KVM_ARM_VCPU_PMU_V3_INIT:
-		return kvm_arm_pmu_v3_init(vcpu);
-	}
-
-	return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ: {
-		int __user *uaddr = (int __user *)(long)attr->addr;
-		int irq;
-
-		if (!irqchip_in_kernel(vcpu->kvm))
-			return -EINVAL;
-
-		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return -ENODEV;
-
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -ENXIO;
-
-		irq = vcpu->arch.pmu.irq_num;
-		return put_user(irq, uaddr);
-	}
-	}
-
-	return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ:
-	case KVM_ARM_VCPU_PMU_V3_INIT:
-		if (kvm_arm_support_pmu_v3() &&
-		    test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return 0;
-	}
-
-	return -ENXIO;
-}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
deleted file mode 100644
index ae364716ee40..000000000000
--- a/virt/kvm/arm/psci.c
+++ /dev/null
@@ -1,564 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2012 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/arm-smccc.h>
-#include <linux/preempt.h>
-#include <linux/kvm_host.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-
-#include <asm/cputype.h>
-#include <asm/kvm_emulate.h>
-
-#include <kvm/arm_psci.h>
-#include <kvm/arm_hypercalls.h>
-
-/*
- * This is an implementation of the Power State Coordination Interface
- * as described in ARM document number ARM DEN 0022A.
- */
-
-#define AFFINITY_MASK(level)	~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
-
-static unsigned long psci_affinity_mask(unsigned long affinity_level)
-{
-	if (affinity_level <= 3)
-		return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
-
-	return 0;
-}
-
-static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * NOTE: For simplicity, we make VCPU suspend emulation to be
-	 * same-as WFI (Wait-for-interrupt) emulation.
-	 *
-	 * This means for KVM the wakeup events are interrupts and
-	 * this is consistent with intended use of StateID as described
-	 * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
-	 *
-	 * Further, we also treat power-down request to be same as
-	 * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2
-	 * specification (ARM DEN 0022A). This means all suspend states
-	 * for KVM will preserve the register state.
-	 */
-	kvm_vcpu_block(vcpu);
-	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.power_off = true;
-	kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
-static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
-{
-	struct vcpu_reset_state *reset_state;
-	struct kvm *kvm = source_vcpu->kvm;
-	struct kvm_vcpu *vcpu = NULL;
-	unsigned long cpu_id;
-
-	cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
-	if (vcpu_mode_is_32bit(source_vcpu))
-		cpu_id &= ~((u32) 0);
-
-	vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
-
-	/*
-	 * Make sure the caller requested a valid CPU and that the CPU is
-	 * turned off.
-	 */
-	if (!vcpu)
-		return PSCI_RET_INVALID_PARAMS;
-	if (!vcpu->arch.power_off) {
-		if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
-			return PSCI_RET_ALREADY_ON;
-		else
-			return PSCI_RET_INVALID_PARAMS;
-	}
-
-	reset_state = &vcpu->arch.reset_state;
-
-	reset_state->pc = smccc_get_arg2(source_vcpu);
-
-	/* Propagate caller endianness */
-	reset_state->be = kvm_vcpu_is_be(source_vcpu);
-
-	/*
-	 * NOTE: We always update r0 (or x0) because for PSCI v0.1
-	 * the general puspose registers are undefined upon CPU_ON.
-	 */
-	reset_state->r0 = smccc_get_arg3(source_vcpu);
-
-	WRITE_ONCE(reset_state->reset, true);
-	kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
-
-	/*
-	 * Make sure the reset request is observed if the change to
-	 * power_state is observed.
-	 */
-	smp_wmb();
-
-	vcpu->arch.power_off = false;
-	kvm_vcpu_wake_up(vcpu);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
-{
-	int i, matching_cpus = 0;
-	unsigned long mpidr;
-	unsigned long target_affinity;
-	unsigned long target_affinity_mask;
-	unsigned long lowest_affinity_level;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *tmp;
-
-	target_affinity = smccc_get_arg1(vcpu);
-	lowest_affinity_level = smccc_get_arg2(vcpu);
-
-	/* Determine target affinity mask */
-	target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
-	if (!target_affinity_mask)
-		return PSCI_RET_INVALID_PARAMS;
-
-	/* Ignore other bits of target affinity */
-	target_affinity &= target_affinity_mask;
-
-	/*
-	 * If one or more VCPU matching target affinity are running
-	 * then ON else OFF
-	 */
-	kvm_for_each_vcpu(i, tmp, kvm) {
-		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
-		if ((mpidr & target_affinity_mask) == target_affinity) {
-			matching_cpus++;
-			if (!tmp->arch.power_off)
-				return PSCI_0_2_AFFINITY_LEVEL_ON;
-		}
-	}
-
-	if (!matching_cpus)
-		return PSCI_RET_INVALID_PARAMS;
-
-	return PSCI_0_2_AFFINITY_LEVEL_OFF;
-}
-
-static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
-{
-	int i;
-	struct kvm_vcpu *tmp;
-
-	/*
-	 * The KVM ABI specifies that a system event exit may call KVM_RUN
-	 * again and may perform shutdown/reboot at a later time that when the
-	 * actual request is made.  Since we are implementing PSCI and a
-	 * caller of PSCI reboot and shutdown expects that the system shuts
-	 * down or reboots immediately, let's make sure that VCPUs are not run
-	 * after this call is handled and before the VCPUs have been
-	 * re-initialized.
-	 */
-	kvm_for_each_vcpu(i, tmp, vcpu->kvm)
-		tmp->arch.power_off = true;
-	kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
-
-	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
-	vcpu->run->system_event.type = type;
-	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
-}
-
-static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
-}
-
-static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
-}
-
-static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
-{
-	int i;
-
-	/*
-	 * Zero the input registers' upper 32 bits. They will be fully
-	 * zeroed on exit, so we're fine changing them in place.
-	 */
-	for (i = 1; i < 4; i++)
-		vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
-}
-
-static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
-{
-	switch(fn) {
-	case PSCI_0_2_FN64_CPU_SUSPEND:
-	case PSCI_0_2_FN64_CPU_ON:
-	case PSCI_0_2_FN64_AFFINITY_INFO:
-		/* Disallow these functions for 32bit guests */
-		if (vcpu_mode_is_32bit(vcpu))
-			return PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	return 0;
-}
-
-static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u32 psci_fn = smccc_get_function(vcpu);
-	unsigned long val;
-	int ret = 1;
-
-	val = kvm_psci_check_allowed_function(vcpu, psci_fn);
-	if (val)
-		goto out;
-
-	switch (psci_fn) {
-	case PSCI_0_2_FN_PSCI_VERSION:
-		/*
-		 * Bits[31:16] = Major Version = 0
-		 * Bits[15:0] = Minor Version = 2
-		 */
-		val = KVM_ARM_PSCI_0_2;
-		break;
-	case PSCI_0_2_FN_CPU_SUSPEND:
-	case PSCI_0_2_FN64_CPU_SUSPEND:
-		val = kvm_psci_vcpu_suspend(vcpu);
-		break;
-	case PSCI_0_2_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case PSCI_0_2_FN_CPU_ON:
-		kvm_psci_narrow_to_32bit(vcpu);
-		fallthrough;
-	case PSCI_0_2_FN64_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	case PSCI_0_2_FN_AFFINITY_INFO:
-		kvm_psci_narrow_to_32bit(vcpu);
-		fallthrough;
-	case PSCI_0_2_FN64_AFFINITY_INFO:
-		val = kvm_psci_vcpu_affinity_info(vcpu);
-		break;
-	case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-		/*
-		 * Trusted OS is MP hence does not require migration
-	         * or
-		 * Trusted OS is not present
-		 */
-		val = PSCI_0_2_TOS_MP;
-		break;
-	case PSCI_0_2_FN_SYSTEM_OFF:
-		kvm_psci_system_off(vcpu);
-		/*
-		 * We should'nt be going back to guest VCPU after
-		 * receiving SYSTEM_OFF request.
-		 *
-		 * If user space accidently/deliberately resumes
-		 * guest VCPU after SYSTEM_OFF request then guest
-		 * VCPU should see internal failure from PSCI return
-		 * value. To achieve this, we preload r0 (or x0) with
-		 * PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	case PSCI_0_2_FN_SYSTEM_RESET:
-		kvm_psci_system_reset(vcpu);
-		/*
-		 * Same reason as SYSTEM_OFF for preloading r0 (or x0)
-		 * with PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-out:
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return ret;
-}
-
-static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
-{
-	u32 psci_fn = smccc_get_function(vcpu);
-	u32 feature;
-	unsigned long val;
-	int ret = 1;
-
-	switch(psci_fn) {
-	case PSCI_0_2_FN_PSCI_VERSION:
-		val = KVM_ARM_PSCI_1_0;
-		break;
-	case PSCI_1_0_FN_PSCI_FEATURES:
-		feature = smccc_get_arg1(vcpu);
-		val = kvm_psci_check_allowed_function(vcpu, feature);
-		if (val)
-			break;
-
-		switch(feature) {
-		case PSCI_0_2_FN_PSCI_VERSION:
-		case PSCI_0_2_FN_CPU_SUSPEND:
-		case PSCI_0_2_FN64_CPU_SUSPEND:
-		case PSCI_0_2_FN_CPU_OFF:
-		case PSCI_0_2_FN_CPU_ON:
-		case PSCI_0_2_FN64_CPU_ON:
-		case PSCI_0_2_FN_AFFINITY_INFO:
-		case PSCI_0_2_FN64_AFFINITY_INFO:
-		case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-		case PSCI_0_2_FN_SYSTEM_OFF:
-		case PSCI_0_2_FN_SYSTEM_RESET:
-		case PSCI_1_0_FN_PSCI_FEATURES:
-		case ARM_SMCCC_VERSION_FUNC_ID:
-			val = 0;
-			break;
-		default:
-			val = PSCI_RET_NOT_SUPPORTED;
-			break;
-		}
-		break;
-	default:
-		return kvm_psci_0_2_call(vcpu);
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return ret;
-}
-
-static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u32 psci_fn = smccc_get_function(vcpu);
-	unsigned long val;
-
-	switch (psci_fn) {
-	case KVM_PSCI_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case KVM_PSCI_FN_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return 1;
-}
-
-/**
- * kvm_psci_call - handle PSCI call if r0 value is in range
- * @vcpu: Pointer to the VCPU struct
- *
- * Handle PSCI calls from guests through traps from HVC instructions.
- * The calling convention is similar to SMC calls to the secure world
- * where the function number is placed in r0.
- *
- * This function returns: > 0 (success), 0 (success but exit to user
- * space), and < 0 (errors)
- *
- * Errors:
- * -EINVAL: Unrecognized PSCI function
- */
-int kvm_psci_call(struct kvm_vcpu *vcpu)
-{
-	switch (kvm_psci_version(vcpu, vcpu->kvm)) {
-	case KVM_ARM_PSCI_1_0:
-		return kvm_psci_1_0_call(vcpu);
-	case KVM_ARM_PSCI_0_2:
-		return kvm_psci_0_2_call(vcpu);
-	case KVM_ARM_PSCI_0_1:
-		return kvm_psci_0_1_call(vcpu);
-	default:
-		return -EINVAL;
-	};
-}
-
-int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
-{
-	return 3;		/* PSCI version and two workaround registers */
-}
-
-int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
-	if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
-		return -EFAULT;
-
-	if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
-		return -EFAULT;
-
-	if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
-		return -EFAULT;
-
-	return 0;
-}
-
-#define KVM_REG_FEATURE_LEVEL_WIDTH	4
-#define KVM_REG_FEATURE_LEVEL_MASK	(BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
-
-/*
- * Convert the workaround level into an easy-to-compare number, where higher
- * values mean better protection.
- */
-static int get_kernel_wa_level(u64 regid)
-{
-	switch (regid) {
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-		switch (kvm_arm_harden_branch_predictor()) {
-		case KVM_BP_HARDEN_UNKNOWN:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-		case KVM_BP_HARDEN_WA_NEEDED:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
-		case KVM_BP_HARDEN_NOT_REQUIRED:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
-		}
-		return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-		switch (kvm_arm_have_ssbd()) {
-		case KVM_SSBD_FORCE_DISABLE:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-		case KVM_SSBD_KERNEL:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
-		case KVM_SSBD_FORCE_ENABLE:
-		case KVM_SSBD_MITIGATED:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-		case KVM_SSBD_UNKNOWN:
-		default:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
-		}
-	}
-
-	return -EINVAL;
-}
-
-int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-	void __user *uaddr = (void __user *)(long)reg->addr;
-	u64 val;
-
-	switch (reg->id) {
-	case KVM_REG_ARM_PSCI_VERSION:
-		val = kvm_psci_version(vcpu, vcpu->kvm);
-		break;
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-		break;
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-
-		if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-		    kvm_arm_get_vcpu_workaround_2_flag(vcpu))
-			val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
-		break;
-	default:
-		return -ENOENT;
-	}
-
-	if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
-		return -EFAULT;
-
-	return 0;
-}
-
-int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-	void __user *uaddr = (void __user *)(long)reg->addr;
-	u64 val;
-	int wa_level;
-
-	if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
-		return -EFAULT;
-
-	switch (reg->id) {
-	case KVM_REG_ARM_PSCI_VERSION:
-	{
-		bool wants_02;
-
-		wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
-
-		switch (val) {
-		case KVM_ARM_PSCI_0_1:
-			if (wants_02)
-				return -EINVAL;
-			vcpu->kvm->arch.psci_version = val;
-			return 0;
-		case KVM_ARM_PSCI_0_2:
-		case KVM_ARM_PSCI_1_0:
-			if (!wants_02)
-				return -EINVAL;
-			vcpu->kvm->arch.psci_version = val;
-			return 0;
-		}
-		break;
-	}
-
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-		if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
-			return -EINVAL;
-
-		if (get_kernel_wa_level(reg->id) < val)
-			return -EINVAL;
-
-		return 0;
-
-	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-		if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
-			    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
-			return -EINVAL;
-
-		wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
-
-		if (get_kernel_wa_level(reg->id) < wa_level)
-			return -EINVAL;
-
-		/* The enabled bit must not be set unless the level is AVAIL. */
-		if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-		    wa_level != val)
-			return -EINVAL;
-
-		/* Are we finished or do we need to check the enable bit ? */
-		if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
-			return 0;
-
-		/*
-		 * If this kernel supports the workaround to be switched on
-		 * or off, make sure it matches the requested setting.
-		 */
-		switch (wa_level) {
-		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
-			kvm_arm_set_vcpu_workaround_2_flag(vcpu,
-			    val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
-			break;
-		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
-			kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
-			break;
-		}
-
-		return 0;
-	default:
-		return -ENOENT;
-	}
-
-	return -EINVAL;
-}
diff --git a/virt/kvm/arm/pvtime.c b/virt/kvm/arm/pvtime.c
deleted file mode 100644
index 1e0f4c284888..000000000000
--- a/virt/kvm/arm/pvtime.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Arm Ltd.
-
-#include <linux/arm-smccc.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_mmu.h>
-#include <asm/pvclock-abi.h>
-
-#include <kvm/arm_hypercalls.h>
-
-void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u64 steal;
-	__le64 steal_le;
-	u64 offset;
-	int idx;
-	u64 base = vcpu->arch.steal.base;
-
-	if (base == GPA_INVALID)
-		return;
-
-	/* Let's do the local bookkeeping */
-	steal = vcpu->arch.steal.steal;
-	steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
-	vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-	vcpu->arch.steal.steal = steal;
-
-	steal_le = cpu_to_le64(steal);
-	idx = srcu_read_lock(&kvm->srcu);
-	offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
-	kvm_put_guest(kvm, base + offset, steal_le, u64);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
-{
-	u32 feature = smccc_get_arg1(vcpu);
-	long val = SMCCC_RET_NOT_SUPPORTED;
-
-	switch (feature) {
-	case ARM_SMCCC_HV_PV_TIME_FEATURES:
-	case ARM_SMCCC_HV_PV_TIME_ST:
-		val = SMCCC_RET_SUCCESS;
-		break;
-	}
-
-	return val;
-}
-
-gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
-{
-	struct pvclock_vcpu_stolen_time init_values = {};
-	struct kvm *kvm = vcpu->kvm;
-	u64 base = vcpu->arch.steal.base;
-	int idx;
-
-	if (base == GPA_INVALID)
-		return base;
-
-	/*
-	 * Start counting stolen time from the time the guest requests
-	 * the feature enabled.
-	 */
-	vcpu->arch.steal.steal = 0;
-	vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return base;
-}
-
-int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
-			    struct kvm_device_attr *attr)
-{
-	u64 __user *user = (u64 __user *)attr->addr;
-	struct kvm *kvm = vcpu->kvm;
-	u64 ipa;
-	int ret = 0;
-	int idx;
-
-	if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
-		return -ENXIO;
-
-	if (get_user(ipa, user))
-		return -EFAULT;
-	if (!IS_ALIGNED(ipa, 64))
-		return -EINVAL;
-	if (vcpu->arch.steal.base != GPA_INVALID)
-		return -EEXIST;
-
-	/* Check the address is in a valid memslot */
-	idx = srcu_read_lock(&kvm->srcu);
-	if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
-		ret = -EINVAL;
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	if (!ret)
-		vcpu->arch.steal.base = ipa;
-
-	return ret;
-}
-
-int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
-			    struct kvm_device_attr *attr)
-{
-	u64 __user *user = (u64 __user *)attr->addr;
-	u64 ipa;
-
-	if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
-		return -ENXIO;
-
-	ipa = vcpu->arch.steal.base;
-
-	if (put_user(ipa, user))
-		return -EFAULT;
-	return 0;
-}
-
-int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PVTIME_IPA:
-		return 0;
-	}
-	return -ENXIO;
-}
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
deleted file mode 100644
index cc94ccc68821..000000000000
--- a/virt/kvm/arm/trace.h
+++ /dev/null
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVM_H
-
-#include <kvm/arm_arch_timer.h>
-#include <linux/tracepoint.h>
-#include <asm/kvm_arm.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-/*
- * Tracepoints for entry/exit to guest
- */
-TRACE_EVENT(kvm_entry,
-	TP_PROTO(unsigned long vcpu_pc),
-	TP_ARGS(vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_exit,
-	TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
-	TP_ARGS(ret, esr_ec, vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	int,		ret		)
-		__field(	unsigned int,	esr_ec		)
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->ret			= ARM_EXCEPTION_CODE(ret);
-		__entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
-		  __print_symbolic(__entry->ret, kvm_arm_exception_type),
-		  __entry->esr_ec,
-		  __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
-		  __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_guest_fault,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
-		 unsigned long hxfar,
-		 unsigned long long ipa),
-	TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	hsr		)
-		__field(	unsigned long,	hxfar		)
-		__field(   unsigned long long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->hsr			= hsr;
-		__entry->hxfar			= hxfar;
-		__entry->ipa			= ipa;
-	),
-
-	TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
-		  __entry->ipa, __entry->hsr,
-		  __entry->hxfar, __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_access_fault,
-	TP_PROTO(unsigned long ipa),
-	TP_ARGS(ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->ipa		= ipa;
-	),
-
-	TP_printk("IPA: %lx", __entry->ipa)
-);
-
-TRACE_EVENT(kvm_irq_line,
-	TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
-	TP_ARGS(type, vcpu_idx, irq_num, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	type		)
-		__field(	int,		vcpu_idx	)
-		__field(	int,		irq_num		)
-		__field(	int,		level		)
-	),
-
-	TP_fast_assign(
-		__entry->type		= type;
-		__entry->vcpu_idx	= vcpu_idx;
-		__entry->irq_num	= irq_num;
-		__entry->level		= level;
-	),
-
-	TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
-		  (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
-		  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
-);
-
-TRACE_EVENT(kvm_mmio_emulate,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
-		 unsigned long cpsr),
-	TP_ARGS(vcpu_pc, instr, cpsr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	instr		)
-		__field(	unsigned long,	cpsr		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->instr			= instr;
-		__entry->cpsr			= cpsr;
-	),
-
-	TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
-		  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
-);
-
-TRACE_EVENT(kvm_unmap_hva_range,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_set_way_flush,
-	    TP_PROTO(unsigned long vcpu_pc, bool cache),
-	    TP_ARGS(vcpu_pc, cache),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		cache		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->cache		= cache;
-	    ),
-
-	    TP_printk("S/W flush at 0x%016lx (cache %s)",
-		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
-);
-
-TRACE_EVENT(kvm_toggle_cache,
-	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
-	    TP_ARGS(vcpu_pc, was, now),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		was		)
-		    __field(	bool,		now		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->was		= was;
-		    __entry->now		= now;
-	    ),
-
-	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
-		      __entry->vcpu_pc, __entry->was ? "on" : "off",
-		      __entry->now ? "on" : "off")
-);
-
-/*
- * Tracepoints for arch_timer
- */
-TRACE_EVENT(kvm_timer_update_irq,
-	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
-	TP_ARGS(vcpu_id, irq, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_id	)
-		__field(	__u32,		irq	)
-		__field(	int,		level	)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id	= vcpu_id;
-		__entry->irq		= irq;
-		__entry->level		= level;
-	),
-
-	TP_printk("VCPU: %ld, IRQ %d, level %d",
-		  __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-TRACE_EVENT(kvm_get_timer_map,
-	TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
-	TP_ARGS(vcpu_id, map),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,		vcpu_id	)
-		__field(	int,			direct_vtimer	)
-		__field(	int,			direct_ptimer	)
-		__field(	int,			emul_ptimer	)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id		= vcpu_id;
-		__entry->direct_vtimer		= arch_timer_ctx_index(map->direct_vtimer);
-		__entry->direct_ptimer =
-			(map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
-		__entry->emul_ptimer =
-			(map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
-	),
-
-	TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
-		  __entry->vcpu_id,
-		  __entry->direct_vtimer,
-		  __entry->direct_ptimer,
-		  __entry->emul_ptimer)
-);
-
-TRACE_EVENT(kvm_timer_save_state,
-	TP_PROTO(struct arch_timer_context *ctx),
-	TP_ARGS(ctx),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,		ctl		)
-		__field(	unsigned long long,	cval		)
-		__field(	int,			timer_idx	)
-	),
-
-	TP_fast_assign(
-		__entry->ctl			= ctx->cnt_ctl;
-		__entry->cval			= ctx->cnt_cval;
-		__entry->timer_idx		= arch_timer_ctx_index(ctx);
-	),
-
-	TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
-		  __entry->ctl,
-		  __entry->cval,
-		  __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_restore_state,
-	TP_PROTO(struct arch_timer_context *ctx),
-	TP_ARGS(ctx),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,		ctl		)
-		__field(	unsigned long long,	cval		)
-		__field(	int,			timer_idx	)
-	),
-
-	TP_fast_assign(
-		__entry->ctl			= ctx->cnt_ctl;
-		__entry->cval			= ctx->cnt_cval;
-		__entry->timer_idx		= arch_timer_ctx_index(ctx);
-	),
-
-	TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
-		  __entry->ctl,
-		  __entry->cval,
-		  __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_hrtimer_expire,
-	TP_PROTO(struct arch_timer_context *ctx),
-	TP_ARGS(ctx),
-
-	TP_STRUCT__entry(
-		__field(	int,			timer_idx	)
-	),
-
-	TP_fast_assign(
-		__entry->timer_idx		= arch_timer_ctx_index(ctx);
-	),
-
-	TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
-);
-
-TRACE_EVENT(kvm_timer_emulate,
-	TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
-	TP_ARGS(ctx, should_fire),
-
-	TP_STRUCT__entry(
-		__field(	int,			timer_idx	)
-		__field(	bool,			should_fire	)
-	),
-
-	TP_fast_assign(
-		__entry->timer_idx		= arch_timer_ctx_index(ctx);
-		__entry->should_fire		= should_fire;
-	),
-
-	TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
-		  __entry->timer_idx, __entry->should_fire)
-);
-
-#endif /* _TRACE_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../virt/kvm/arm
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h
deleted file mode 100644
index 4fd4f6db181b..000000000000
--- a/virt/kvm/arm/vgic/trace.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_VGIC_H
-
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(vgic_update_irq_pending,
-	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
-	TP_ARGS(vcpu_id, irq, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_id	)
-		__field(	__u32,		irq	)
-		__field(	bool,		level	)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id	= vcpu_id;
-		__entry->irq		= irq;
-		__entry->level		= level;
-	),
-
-	TP_printk("VCPU: %ld, IRQ %d, level: %d",
-		  __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-#endif /* _TRACE_VGIC_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
deleted file mode 100644
index b13a9e3f99dd..000000000000
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ /dev/null
@@ -1,300 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2016 Linaro
- * Author: Christoffer Dall <christoffer.dall@linaro.org>
- */
-
-#include <linux/cpu.h>
-#include <linux/debugfs.h>
-#include <linux/interrupt.h>
-#include <linux/kvm_host.h>
-#include <linux/seq_file.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Structure to control looping through the entire vgic state.  We start at
- * zero for each field and move upwards.  So, if dist_id is 0 we print the
- * distributor info.  When dist_id is 1, we have already printed it and move
- * on.
- *
- * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
- * so on.
- */
-struct vgic_state_iter {
-	int nr_cpus;
-	int nr_spis;
-	int nr_lpis;
-	int dist_id;
-	int vcpu_id;
-	int intid;
-	int lpi_idx;
-	u32 *lpi_array;
-};
-
-static void iter_next(struct vgic_state_iter *iter)
-{
-	if (iter->dist_id == 0) {
-		iter->dist_id++;
-		return;
-	}
-
-	iter->intid++;
-	if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
-	    ++iter->vcpu_id < iter->nr_cpus)
-		iter->intid = 0;
-
-	if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-		if (iter->lpi_idx < iter->nr_lpis)
-			iter->intid = iter->lpi_array[iter->lpi_idx];
-		iter->lpi_idx++;
-	}
-}
-
-static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
-		      loff_t pos)
-{
-	int nr_cpus = atomic_read(&kvm->online_vcpus);
-
-	memset(iter, 0, sizeof(*iter));
-
-	iter->nr_cpus = nr_cpus;
-	iter->nr_spis = kvm->arch.vgic.nr_spis;
-	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
-		if (iter->nr_lpis < 0)
-			iter->nr_lpis = 0;
-	}
-
-	/* Fast forward to the right position if needed */
-	while (pos--)
-		iter_next(iter);
-}
-
-static bool end_of_vgic(struct vgic_state_iter *iter)
-{
-	return iter->dist_id > 0 &&
-		iter->vcpu_id == iter->nr_cpus &&
-		iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
-		iter->lpi_idx > iter->nr_lpis;
-}
-
-static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter;
-
-	mutex_lock(&kvm->lock);
-	iter = kvm->arch.vgic.iter;
-	if (iter) {
-		iter = ERR_PTR(-EBUSY);
-		goto out;
-	}
-
-	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter) {
-		iter = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	iter_init(kvm, iter, *pos);
-	kvm->arch.vgic.iter = iter;
-
-	if (end_of_vgic(iter))
-		iter = NULL;
-out:
-	mutex_unlock(&kvm->lock);
-	return iter;
-}
-
-static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter = kvm->arch.vgic.iter;
-
-	++*pos;
-	iter_next(iter);
-	if (end_of_vgic(iter))
-		iter = NULL;
-	return iter;
-}
-
-static void vgic_debug_stop(struct seq_file *s, void *v)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter;
-
-	/*
-	 * If the seq file wasn't properly opened, there's nothing to clearn
-	 * up.
-	 */
-	if (IS_ERR(v))
-		return;
-
-	mutex_lock(&kvm->lock);
-	iter = kvm->arch.vgic.iter;
-	kfree(iter->lpi_array);
-	kfree(iter);
-	kvm->arch.vgic.iter = NULL;
-	mutex_unlock(&kvm->lock);
-}
-
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
-{
-	bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
-
-	seq_printf(s, "Distributor\n");
-	seq_printf(s, "===========\n");
-	seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
-	seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
-	if (v3)
-		seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
-	seq_printf(s, "enabled:\t%d\n", dist->enabled);
-	seq_printf(s, "\n");
-
-	seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
-	seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
-	seq_printf(s, "G=group\n");
-}
-
-static void print_header(struct seq_file *s, struct vgic_irq *irq,
-			 struct kvm_vcpu *vcpu)
-{
-	int id = 0;
-	char *hdr = "SPI ";
-
-	if (vcpu) {
-		hdr = "VCPU";
-		id = vcpu->vcpu_id;
-	}
-
-	seq_printf(s, "\n");
-	seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
-	seq_printf(s, "----------------------------------------------------------------\n");
-}
-
-static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
-			    struct kvm_vcpu *vcpu)
-{
-	char *type;
-	bool pending;
-
-	if (irq->intid < VGIC_NR_SGIS)
-		type = "SGI";
-	else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
-		type = "PPI";
-	else if (irq->intid < VGIC_MAX_SPI)
-		type = "SPI";
-	else
-		type = "LPI";
-
-	if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
-		print_header(s, irq, vcpu);
-
-	pending = irq->pending_latch;
-	if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-		int err;
-
-		err = irq_get_irqchip_state(irq->host_irq,
-					    IRQCHIP_STATE_PENDING,
-					    &pending);
-		WARN_ON_ONCE(err);
-	}
-
-	seq_printf(s, "       %s %4d "
-		      "    %2d "
-		      "%d%d%d%d%d%d%d "
-		      "%8d "
-		      "%8x "
-		      " %2x "
-		      "%3d "
-		      "     %2d "
-		      "\n",
-			type, irq->intid,
-			(irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
-			pending,
-			irq->line_level,
-			irq->active,
-			irq->enabled,
-			irq->hw,
-			irq->config == VGIC_CONFIG_LEVEL,
-			irq->group,
-			irq->hwintid,
-			irq->mpidr,
-			irq->source,
-			irq->priority,
-			(irq->vcpu) ? irq->vcpu->vcpu_id : -1);
-}
-
-static int vgic_debug_show(struct seq_file *s, void *v)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
-	struct vgic_irq *irq;
-	struct kvm_vcpu *vcpu = NULL;
-	unsigned long flags;
-
-	if (iter->dist_id == 0) {
-		print_dist_state(s, &kvm->arch.vgic);
-		return 0;
-	}
-
-	if (!kvm->arch.vgic.initialized)
-		return 0;
-
-	if (iter->vcpu_id < iter->nr_cpus)
-		vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
-
-	irq = vgic_get_irq(kvm, vcpu, iter->intid);
-	if (!irq) {
-		seq_printf(s, "       LPI %4d freed\n", iter->intid);
-		return 0;
-	}
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	print_irq_state(s, irq, vcpu);
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	vgic_put_irq(kvm, irq);
-	return 0;
-}
-
-static const struct seq_operations vgic_debug_seq_ops = {
-	.start = vgic_debug_start,
-	.next  = vgic_debug_next,
-	.stop  = vgic_debug_stop,
-	.show  = vgic_debug_show
-};
-
-static int debug_open(struct inode *inode, struct file *file)
-{
-	int ret;
-	ret = seq_open(file, &vgic_debug_seq_ops);
-	if (!ret) {
-		struct seq_file *seq;
-		/* seq_open will have modified file->private_data */
-		seq = file->private_data;
-		seq->private = inode->i_private;
-	}
-
-	return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
-	.owner   = THIS_MODULE,
-	.open    = debug_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-void vgic_debug_init(struct kvm *kvm)
-{
-	debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
-			    &vgic_debug_fops);
-}
-
-void vgic_debug_destroy(struct kvm *kvm)
-{
-}
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
deleted file mode 100644
index 32e32d67a127..000000000000
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ /dev/null
@@ -1,556 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.  The basic
- * idea is that even though the VGIC is not functional or not requested from
- * user space, the critical path of the run loop can still call VGIC functions
- * that just won't do anything, without them having to check additional
- * initialization flags to ensure they don't look at uninitialized data
- * structures.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *
- * CPU Interface:
- *
- * - kvm_vgic_vcpu_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
-
-/* EARLY INIT */
-
-/**
- * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
- * @kvm: The VM whose VGIC districutor should be initialized
- *
- * Only do initialization of static structures that don't require any
- * allocation or sizing information from userspace.  vgic_init() called
- * kvm_vgic_dist_init() which takes care of the rest.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	INIT_LIST_HEAD(&dist->lpi_list_head);
-	INIT_LIST_HEAD(&dist->lpi_translation_cache);
-	raw_spin_lock_init(&dist->lpi_list_lock);
-}
-
-/* CREATION */
-
-/**
- * kvm_vgic_create: triggered by the instantiation of the VGIC device by
- * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
- * or through the generic KVM_CREATE_DEVICE API ioctl.
- * irqchip_in_kernel() tells you if this function succeeded or not.
- * @kvm: kvm struct pointer
- * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
- */
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-	int i, ret;
-	struct kvm_vcpu *vcpu;
-
-	if (irqchip_in_kernel(kvm))
-		return -EEXIST;
-
-	/*
-	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
-	 * which had no chance yet to check the availability of the GICv2
-	 * emulation. So check this here again. KVM_CREATE_DEVICE does
-	 * the proper checks already.
-	 */
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		!kvm_vgic_global_state.can_emulate_gicv2)
-		return -ENODEV;
-
-	ret = -EBUSY;
-	if (!lock_all_vcpus(kvm))
-		return ret;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu->arch.has_run_once)
-			goto out_unlock;
-	}
-	ret = 0;
-
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-		kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-	else
-		kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
-
-	if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
-		ret = -E2BIG;
-		goto out_unlock;
-	}
-
-	kvm->arch.vgic.in_kernel = true;
-	kvm->arch.vgic.vgic_model = type;
-
-	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-	else
-		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
-
-out_unlock:
-	unlock_all_vcpus(kvm);
-	return ret;
-}
-
-/* INIT/DESTROY */
-
-/**
- * kvm_vgic_dist_init: initialize the dist data structures
- * @kvm: kvm struct pointer
- * @nr_spis: number of spis, frozen by caller
- */
-static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
-	int i;
-
-	dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
-	if (!dist->spis)
-		return  -ENOMEM;
-
-	/*
-	 * In the following code we do not take the irq struct lock since
-	 * no other action on irq structs can happen while the VGIC is
-	 * not initialized yet:
-	 * If someone wants to inject an interrupt or does a MMIO access, we
-	 * require prior initialization in case of a virtual GICv3 or trigger
-	 * initialization when using a virtual GICv2.
-	 */
-	for (i = 0; i < nr_spis; i++) {
-		struct vgic_irq *irq = &dist->spis[i];
-
-		irq->intid = i + VGIC_NR_PRIVATE_IRQS;
-		INIT_LIST_HEAD(&irq->ap_list);
-		raw_spin_lock_init(&irq->irq_lock);
-		irq->vcpu = NULL;
-		irq->target_vcpu = vcpu0;
-		kref_init(&irq->refcount);
-		switch (dist->vgic_model) {
-		case KVM_DEV_TYPE_ARM_VGIC_V2:
-			irq->targets = 0;
-			irq->group = 0;
-			break;
-		case KVM_DEV_TYPE_ARM_VGIC_V3:
-			irq->mpidr = 0;
-			irq->group = 1;
-			break;
-		default:
-			kfree(dist->spis);
-			dist->spis = NULL;
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int ret = 0;
-	int i;
-
-	vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
-
-	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-	raw_spin_lock_init(&vgic_cpu->ap_list_lock);
-	atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
-
-	/*
-	 * Enable and configure all SGIs to be edge-triggered and
-	 * configure all PPIs as level-triggered.
-	 */
-	for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-		struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-
-		INIT_LIST_HEAD(&irq->ap_list);
-		raw_spin_lock_init(&irq->irq_lock);
-		irq->intid = i;
-		irq->vcpu = NULL;
-		irq->target_vcpu = vcpu;
-		kref_init(&irq->refcount);
-		if (vgic_irq_is_sgi(i)) {
-			/* SGIs */
-			irq->enabled = 1;
-			irq->config = VGIC_CONFIG_EDGE;
-		} else {
-			/* PPIs */
-			irq->config = VGIC_CONFIG_LEVEL;
-		}
-	}
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return 0;
-
-	/*
-	 * If we are creating a VCPU with a GICv3 we must also register the
-	 * KVM io device for the redistributor that belongs to this VCPU.
-	 */
-	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		mutex_lock(&vcpu->kvm->lock);
-		ret = vgic_register_redist_iodev(vcpu);
-		mutex_unlock(&vcpu->kvm->lock);
-	}
-	return ret;
-}
-
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_enable(vcpu);
-	else
-		vgic_v3_enable(vcpu);
-}
-
-/*
- * vgic_init: allocates and initializes dist and vcpu data structures
- * depending on two dimensioning parameters:
- * - the number of spis
- * - the number of vcpus
- * The function is generally called when nr_spis has been explicitly set
- * by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
- * vgic_initialized() returns true when this function has succeeded.
- * Must be called with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int ret = 0, i, idx;
-
-	if (vgic_initialized(kvm))
-		return 0;
-
-	/* Are we also in the middle of creating a VCPU? */
-	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
-		return -EBUSY;
-
-	/* freeze the number of spis */
-	if (!dist->nr_spis)
-		dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
-
-	ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
-	if (ret)
-		goto out;
-
-	/* Initialize groups on CPUs created before the VGIC type was known */
-	kvm_for_each_vcpu(idx, vcpu, kvm) {
-		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-		for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-			struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-			switch (dist->vgic_model) {
-			case KVM_DEV_TYPE_ARM_VGIC_V3:
-				irq->group = 1;
-				irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
-				break;
-			case KVM_DEV_TYPE_ARM_VGIC_V2:
-				irq->group = 0;
-				irq->targets = 1U << idx;
-				break;
-			default:
-				ret = -EINVAL;
-				goto out;
-			}
-		}
-	}
-
-	if (vgic_has_its(kvm))
-		vgic_lpi_translation_cache_init(kvm);
-
-	/*
-	 * If we have GICv4.1 enabled, unconditionnaly request enable the
-	 * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
-	 * enable it if we present a virtual ITS to the guest.
-	 */
-	if (vgic_supports_direct_msis(kvm)) {
-		ret = vgic_v4_init(kvm);
-		if (ret)
-			goto out;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_enable(vcpu);
-
-	ret = kvm_vgic_setup_default_irq_routing(kvm);
-	if (ret)
-		goto out;
-
-	vgic_debug_init(kvm);
-
-	dist->implementation_rev = 2;
-	dist->initialized = true;
-
-out:
-	return ret;
-}
-
-static void kvm_vgic_dist_destroy(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg, *next;
-
-	dist->ready = false;
-	dist->initialized = false;
-
-	kfree(dist->spis);
-	dist->spis = NULL;
-	dist->nr_spis = 0;
-
-	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-			list_del(&rdreg->list);
-			kfree(rdreg);
-		}
-		INIT_LIST_HEAD(&dist->rd_regions);
-	}
-
-	if (vgic_has_its(kvm))
-		vgic_lpi_translation_cache_destroy(kvm);
-
-	if (vgic_supports_direct_msis(kvm))
-		vgic_v4_teardown(kvm);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	/*
-	 * Retire all pending LPIs on this vcpu anyway as we're
-	 * going to destroy it.
-	 */
-	vgic_flush_pending_lpis(vcpu);
-
-	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-}
-
-/* To be called with kvm->lock held */
-static void __kvm_vgic_destroy(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	vgic_debug_destroy(kvm);
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_destroy(vcpu);
-
-	kvm_vgic_dist_destroy(kvm);
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-	mutex_lock(&kvm->lock);
-	__kvm_vgic_destroy(kvm);
-	mutex_unlock(&kvm->lock);
-}
-
-/**
- * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
- * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
- * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
- * @kvm: kvm struct pointer
- */
-int vgic_lazy_init(struct kvm *kvm)
-{
-	int ret = 0;
-
-	if (unlikely(!vgic_initialized(kvm))) {
-		/*
-		 * We only provide the automatic initialization of the VGIC
-		 * for the legacy case of a GICv2. Any other type must
-		 * be explicitly initialized once setup with the respective
-		 * KVM device call.
-		 */
-		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-			return -EBUSY;
-
-		mutex_lock(&kvm->lock);
-		ret = vgic_init(kvm);
-		mutex_unlock(&kvm->lock);
-	}
-
-	return ret;
-}
-
-/* RESOURCE MAPPING */
-
-/**
- * Map the MMIO regions depending on the VGIC model exposed to the guest
- * called on the first VCPU run.
- * Also map the virtual CPU interface into the VM.
- * v2/v3 derivatives call vgic_init if not already done.
- * vgic_ready() returns true if this function has succeeded.
- * @kvm: kvm struct pointer
- */
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int ret = 0;
-
-	mutex_lock(&kvm->lock);
-	if (!irqchip_in_kernel(kvm))
-		goto out;
-
-	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-		ret = vgic_v2_map_resources(kvm);
-	else
-		ret = vgic_v3_map_resources(kvm);
-
-	if (ret)
-		__kvm_vgic_destroy(kvm);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-/* GENERIC PROBE */
-
-static int vgic_init_cpu_starting(unsigned int cpu)
-{
-	enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
-	return 0;
-}
-
-
-static int vgic_init_cpu_dying(unsigned int cpu)
-{
-	disable_percpu_irq(kvm_vgic_global_state.maint_irq);
-	return 0;
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-	/*
-	 * We cannot rely on the vgic maintenance interrupt to be
-	 * delivered synchronously. This means we can only use it to
-	 * exit the VM, and we perform the handling of EOIed
-	 * interrupts on the exit path (see vgic_fold_lr_state).
-	 */
-	return IRQ_HANDLED;
-}
-
-/**
- * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
- *
- * For a specific CPU, initialize the GIC VE hardware.
- */
-void kvm_vgic_init_cpu_hardware(void)
-{
-	BUG_ON(preemptible());
-
-	/*
-	 * We want to make sure the list registers start out clear so that we
-	 * only have the program the used registers.
-	 */
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_init_lrs();
-	else
-		kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
- * according to the host GIC model. Accordingly calls either
- * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
- * instantiated by a guest later on .
- */
-int kvm_vgic_hyp_init(void)
-{
-	const struct gic_kvm_info *gic_kvm_info;
-	int ret;
-
-	gic_kvm_info = gic_get_kvm_info();
-	if (!gic_kvm_info)
-		return -ENODEV;
-
-	if (!gic_kvm_info->maint_irq) {
-		kvm_err("No vgic maintenance irq\n");
-		return -ENXIO;
-	}
-
-	switch (gic_kvm_info->type) {
-	case GIC_V2:
-		ret = vgic_v2_probe(gic_kvm_info);
-		break;
-	case GIC_V3:
-		ret = vgic_v3_probe(gic_kvm_info);
-		if (!ret) {
-			static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
-			kvm_info("GIC system register CPU interface enabled\n");
-		}
-		break;
-	default:
-		ret = -ENODEV;
-	}
-
-	if (ret)
-		return ret;
-
-	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
-	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
-				 vgic_maintenance_handler,
-				 "vgic", kvm_get_running_vcpus());
-	if (ret) {
-		kvm_err("Cannot register interrupt %d\n",
-			kvm_vgic_global_state.maint_irq);
-		return ret;
-	}
-
-	ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
-				"kvm/arm/vgic:starting",
-				vgic_init_cpu_starting, vgic_init_cpu_dying);
-	if (ret) {
-		kvm_err("Cannot register vgic CPU notifier\n");
-		goto out_free_irq;
-	}
-
-	kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
-	return 0;
-
-out_free_irq:
-	free_percpu_irq(kvm_vgic_global_state.maint_irq,
-			kvm_get_running_vcpus());
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
deleted file mode 100644
index d8cdfea5cc96..000000000000
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ /dev/null
@@ -1,141 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <trace/events/kvm.h>
-#include <kvm/arm_vgic.h>
-#include "vgic.h"
-
-/**
- * vgic_irqfd_set_irq: inject the IRQ corresponding to the
- * irqchip routing entry
- *
- * This is the entry point for irqfd IRQ injection
- */
-static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
-			struct kvm *kvm, int irq_source_id,
-			int level, bool line_status)
-{
-	unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
-
-	if (!vgic_valid_spi(kvm, spi_id))
-		return -EINVAL;
-	return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
-}
-
-/**
- * kvm_set_routing_entry: populate a kvm routing entry
- * from a user routing entry
- *
- * @kvm: the VM this entry is applied to
- * @e: kvm kernel routing entry handle
- * @ue: user api routing entry handle
- * return 0 on success, -EINVAL on errors.
- */
-int kvm_set_routing_entry(struct kvm *kvm,
-			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
-{
-	int r = -EINVAL;
-
-	switch (ue->type) {
-	case KVM_IRQ_ROUTING_IRQCHIP:
-		e->set = vgic_irqfd_set_irq;
-		e->irqchip.irqchip = ue->u.irqchip.irqchip;
-		e->irqchip.pin = ue->u.irqchip.pin;
-		if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
-		    (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
-			goto out;
-		break;
-	case KVM_IRQ_ROUTING_MSI:
-		e->set = kvm_set_msi;
-		e->msi.address_lo = ue->u.msi.address_lo;
-		e->msi.address_hi = ue->u.msi.address_hi;
-		e->msi.data = ue->u.msi.data;
-		e->msi.flags = ue->flags;
-		e->msi.devid = ue->u.msi.devid;
-		break;
-	default:
-		goto out;
-	}
-	r = 0;
-out:
-	return r;
-}
-
-static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
-			     struct kvm_msi *msi)
-{
-	msi->address_lo = e->msi.address_lo;
-	msi->address_hi = e->msi.address_hi;
-	msi->data = e->msi.data;
-	msi->flags = e->msi.flags;
-	msi->devid = e->msi.devid;
-}
-/**
- * kvm_set_msi: inject the MSI corresponding to the
- * MSI routing entry
- *
- * This is the entry point for irqfd MSI injection
- * and userspace MSI injection.
- */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		struct kvm *kvm, int irq_source_id,
-		int level, bool line_status)
-{
-	struct kvm_msi msi;
-
-	if (!vgic_has_its(kvm))
-		return -ENODEV;
-
-	if (!level)
-		return -1;
-
-	kvm_populate_msi(e, &msi);
-	return vgic_its_inject_msi(kvm, &msi);
-}
-
-/**
- * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
- *
- * Currently only direct MSI injection is supported.
- */
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
-			      struct kvm *kvm, int irq_source_id, int level,
-			      bool line_status)
-{
-	if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) {
-		struct kvm_msi msi;
-
-		kvm_populate_msi(e, &msi);
-		if (!vgic_its_inject_cached_translation(kvm, &msi))
-			return 0;
-	}
-
-	return -EWOULDBLOCK;
-}
-
-int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
-{
-	struct kvm_irq_routing_entry *entries;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	u32 nr = dist->nr_spis;
-	int i, ret;
-
-	entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
-	if (!entries)
-		return -ENOMEM;
-
-	for (i = 0; i < nr; i++) {
-		entries[i].gsi = i;
-		entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
-		entries[i].u.irqchip.irqchip = 0;
-		entries[i].u.irqchip.pin = i;
-	}
-	ret = kvm_set_irq_routing(kvm, entries, nr, 0);
-	kfree(entries);
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
deleted file mode 100644
index c012a52b19f5..000000000000
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ /dev/null
@@ -1,2783 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * GICv3 ITS emulation
- *
- * Copyright (C) 2015,2016 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/list.h>
-#include <linux/uaccess.h>
-#include <linux/list_sort.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-static int vgic_its_save_tables_v0(struct vgic_its *its);
-static int vgic_its_restore_tables_v0(struct vgic_its *its);
-static int vgic_its_commit_v0(struct vgic_its *its);
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-			     struct kvm_vcpu *filter_vcpu, bool needs_inv);
-
-/*
- * Creates a new (reference to a) struct vgic_irq for a given LPI.
- * If this LPI is already mapped on another ITS, we increase its refcount
- * and return a pointer to the existing structure.
- * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
- * This function returns a pointer to the _unlocked_ structure.
- */
-static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
-				     struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
-	unsigned long flags;
-	int ret;
-
-	/* In this case there is no put, since we keep the reference. */
-	if (irq)
-		return irq;
-
-	irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
-	if (!irq)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&irq->lpi_list);
-	INIT_LIST_HEAD(&irq->ap_list);
-	raw_spin_lock_init(&irq->irq_lock);
-
-	irq->config = VGIC_CONFIG_EDGE;
-	kref_init(&irq->refcount);
-	irq->intid = intid;
-	irq->target_vcpu = vcpu;
-	irq->group = 1;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	/*
-	 * There could be a race with another vgic_add_lpi(), so we need to
-	 * check that we don't add a second list entry with the same LPI.
-	 */
-	list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
-		if (oldirq->intid != intid)
-			continue;
-
-		/* Someone was faster with adding this LPI, lets use that. */
-		kfree(irq);
-		irq = oldirq;
-
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() on the returned pointer once it's
-		 * finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-
-		goto out_unlock;
-	}
-
-	list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
-	dist->lpi_list_count++;
-
-out_unlock:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	/*
-	 * We "cache" the configuration table entries in our struct vgic_irq's.
-	 * However we only have those structs for mapped IRQs, so we read in
-	 * the respective config data from memory here upon mapping the LPI.
-	 *
-	 * Should any of these fail, behave as if we couldn't create the LPI
-	 * by dropping the refcount and returning the error.
-	 */
-	ret = update_lpi_config(kvm, irq, NULL, false);
-	if (ret) {
-		vgic_put_irq(kvm, irq);
-		return ERR_PTR(ret);
-	}
-
-	ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
-	if (ret) {
-		vgic_put_irq(kvm, irq);
-		return ERR_PTR(ret);
-	}
-
-	return irq;
-}
-
-struct its_device {
-	struct list_head dev_list;
-
-	/* the head for the list of ITTEs */
-	struct list_head itt_head;
-	u32 num_eventid_bits;
-	gpa_t itt_addr;
-	u32 device_id;
-};
-
-#define COLLECTION_NOT_MAPPED ((u32)~0)
-
-struct its_collection {
-	struct list_head coll_list;
-
-	u32 collection_id;
-	u32 target_addr;
-};
-
-#define its_is_collection_mapped(coll) ((coll) && \
-				((coll)->target_addr != COLLECTION_NOT_MAPPED))
-
-struct its_ite {
-	struct list_head ite_list;
-
-	struct vgic_irq *irq;
-	struct its_collection *collection;
-	u32 event_id;
-};
-
-struct vgic_translation_cache_entry {
-	struct list_head	entry;
-	phys_addr_t		db;
-	u32			devid;
-	u32			eventid;
-	struct vgic_irq		*irq;
-};
-
-/**
- * struct vgic_its_abi - ITS abi ops and settings
- * @cte_esz: collection table entry size
- * @dte_esz: device table entry size
- * @ite_esz: interrupt translation table entry size
- * @save tables: save the ITS tables into guest RAM
- * @restore_tables: restore the ITS internal structs from tables
- *  stored in guest RAM
- * @commit: initialize the registers which expose the ABI settings,
- *  especially the entry sizes
- */
-struct vgic_its_abi {
-	int cte_esz;
-	int dte_esz;
-	int ite_esz;
-	int (*save_tables)(struct vgic_its *its);
-	int (*restore_tables)(struct vgic_its *its);
-	int (*commit)(struct vgic_its *its);
-};
-
-#define ABI_0_ESZ	8
-#define ESZ_MAX		ABI_0_ESZ
-
-static const struct vgic_its_abi its_table_abi_versions[] = {
-	[0] = {
-	 .cte_esz = ABI_0_ESZ,
-	 .dte_esz = ABI_0_ESZ,
-	 .ite_esz = ABI_0_ESZ,
-	 .save_tables = vgic_its_save_tables_v0,
-	 .restore_tables = vgic_its_restore_tables_v0,
-	 .commit = vgic_its_commit_v0,
-	},
-};
-
-#define NR_ITS_ABIS	ARRAY_SIZE(its_table_abi_versions)
-
-inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
-{
-	return &its_table_abi_versions[its->abi_rev];
-}
-
-static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
-{
-	const struct vgic_its_abi *abi;
-
-	its->abi_rev = rev;
-	abi = vgic_its_get_abi(its);
-	return abi->commit(its);
-}
-
-/*
- * Find and returns a device in the device table for an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
-{
-	struct its_device *device;
-
-	list_for_each_entry(device, &its->device_list, dev_list)
-		if (device_id == device->device_id)
-			return device;
-
-	return NULL;
-}
-
-/*
- * Find and returns an interrupt translation table entry (ITTE) for a given
- * Device ID/Event ID pair on an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
-				  u32 event_id)
-{
-	struct its_device *device;
-	struct its_ite *ite;
-
-	device = find_its_device(its, device_id);
-	if (device == NULL)
-		return NULL;
-
-	list_for_each_entry(ite, &device->itt_head, ite_list)
-		if (ite->event_id == event_id)
-			return ite;
-
-	return NULL;
-}
-
-/* To be used as an iterator this macro misses the enclosing parentheses */
-#define for_each_lpi_its(dev, ite, its) \
-	list_for_each_entry(dev, &(its)->device_list, dev_list) \
-		list_for_each_entry(ite, &(dev)->itt_head, ite_list)
-
-#define GIC_LPI_OFFSET 8192
-
-#define VITS_TYPER_IDBITS 16
-#define VITS_TYPER_DEVBITS 16
-#define VITS_DTE_MAX_DEVID_OFFSET	(BIT(14) - 1)
-#define VITS_ITE_MAX_EVENTID_OFFSET	(BIT(16) - 1)
-
-/*
- * Finds and returns a collection in the ITS collection table.
- * Must be called with the its_lock mutex held.
- */
-static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
-{
-	struct its_collection *collection;
-
-	list_for_each_entry(collection, &its->collection_list, coll_list) {
-		if (coll_id == collection->collection_id)
-			return collection;
-	}
-
-	return NULL;
-}
-
-#define LPI_PROP_ENABLE_BIT(p)	((p) & LPI_PROP_ENABLED)
-#define LPI_PROP_PRIORITY(p)	((p) & 0xfc)
-
-/*
- * Reads the configuration data for a given LPI from guest memory and
- * updates the fields in struct vgic_irq.
- * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
- * VCPU. Unconditionally applies if filter_vcpu is NULL.
- */
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-			     struct kvm_vcpu *filter_vcpu, bool needs_inv)
-{
-	u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
-	u8 prop;
-	int ret;
-	unsigned long flags;
-
-	ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
-				  &prop, 1);
-
-	if (ret)
-		return ret;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
-		irq->priority = LPI_PROP_PRIORITY(prop);
-		irq->enabled = LPI_PROP_ENABLE_BIT(prop);
-
-		if (!irq->hw) {
-			vgic_queue_irq_unlock(kvm, irq, flags);
-			return 0;
-		}
-	}
-
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	if (irq->hw)
-		return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
-
-	return 0;
-}
-
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq;
-	unsigned long flags;
-	u32 *intids;
-	int irq_count, i = 0;
-
-	/*
-	 * There is an obvious race between allocating the array and LPIs
-	 * being mapped/unmapped. If we ended up here as a result of a
-	 * command, we're safe (locks are held, preventing another
-	 * command). If coming from another path (such as enabling LPIs),
-	 * we must be careful not to overrun the array.
-	 */
-	irq_count = READ_ONCE(dist->lpi_list_count);
-	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
-	if (!intids)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		if (i == irq_count)
-			break;
-		/* We don't need to "get" the IRQ, as we hold the list lock. */
-		if (vcpu && irq->target_vcpu != vcpu)
-			continue;
-		intids[i++] = irq->intid;
-	}
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	*intid_ptr = intids;
-	return i;
-}
-
-static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
-{
-	int ret = 0;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->target_vcpu = vcpu;
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	if (irq->hw) {
-		struct its_vlpi_map map;
-
-		ret = its_get_vlpi(irq->host_irq, &map);
-		if (ret)
-			return ret;
-
-		if (map.vpe)
-			atomic_dec(&map.vpe->vlpi_count);
-		map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-		atomic_inc(&map.vpe->vlpi_count);
-
-		ret = its_map_vlpi(irq->host_irq, &map);
-	}
-
-	return ret;
-}
-
-/*
- * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
- * is targeting) to the VGIC's view, which deals with target VCPUs.
- * Needs to be called whenever either the collection for a LPIs has
- * changed or the collection itself got retargeted.
- */
-static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
-{
-	struct kvm_vcpu *vcpu;
-
-	if (!its_is_collection_mapped(ite->collection))
-		return;
-
-	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-	update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Updates the target VCPU for every LPI targeting this collection.
- * Must be called with the its_lock mutex held.
- */
-static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
-				       struct its_collection *coll)
-{
-	struct its_device *device;
-	struct its_ite *ite;
-
-	for_each_lpi_its(device, ite, its) {
-		if (!ite->collection || coll != ite->collection)
-			continue;
-
-		update_affinity_ite(kvm, ite);
-	}
-}
-
-static u32 max_lpis_propbaser(u64 propbaser)
-{
-	int nr_idbits = (propbaser & 0x1f) + 1;
-
-	return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
-}
-
-/*
- * Sync the pending table pending bit of LPIs targeting @vcpu
- * with our own data structures. This relies on the LPI being
- * mapped before.
- */
-static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
-{
-	gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-	struct vgic_irq *irq;
-	int last_byte_offset = -1;
-	int ret = 0;
-	u32 *intids;
-	int nr_irqs, i;
-	unsigned long flags;
-	u8 pendmask;
-
-	nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
-	if (nr_irqs < 0)
-		return nr_irqs;
-
-	for (i = 0; i < nr_irqs; i++) {
-		int byte_offset, bit_nr;
-
-		byte_offset = intids[i] / BITS_PER_BYTE;
-		bit_nr = intids[i] % BITS_PER_BYTE;
-
-		/*
-		 * For contiguously allocated LPIs chances are we just read
-		 * this very same byte in the last iteration. Reuse that.
-		 */
-		if (byte_offset != last_byte_offset) {
-			ret = kvm_read_guest_lock(vcpu->kvm,
-						  pendbase + byte_offset,
-						  &pendmask, 1);
-			if (ret) {
-				kfree(intids);
-				return ret;
-			}
-			last_byte_offset = byte_offset;
-		}
-
-		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = pendmask & (1U << bit_nr);
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	kfree(intids);
-
-	return ret;
-}
-
-static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 reg = GITS_TYPER_PLPIS;
-
-	/*
-	 * We use linear CPU numbers for redistributor addressing,
-	 * so GITS_TYPER.PTA is 0.
-	 * Also we force all PROPBASER registers to be the same, so
-	 * CommonLPIAff is 0 as well.
-	 * To avoid memory waste in the guest, we keep the number of IDBits and
-	 * DevBits low - as least for the time being.
-	 */
-	reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
-	reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
-	reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
-
-	return extract_bytes(reg, addr & 7, len);
-}
-
-static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
-					     struct vgic_its *its,
-					     gpa_t addr, unsigned int len)
-{
-	u32 val;
-
-	val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
-	val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
-	return val;
-}
-
-static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
-					    struct vgic_its *its,
-					    gpa_t addr, unsigned int len,
-					    unsigned long val)
-{
-	u32 rev = GITS_IIDR_REV(val);
-
-	if (rev >= NR_ITS_ABIS)
-		return -EINVAL;
-	return vgic_its_set_abi(its, rev);
-}
-
-static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	switch (addr & 0xffff) {
-	case GITS_PIDR0:
-		return 0x92;	/* part number, bits[7:0] */
-	case GITS_PIDR1:
-		return 0xb4;	/* part number, bits[11:8] */
-	case GITS_PIDR2:
-		return GIC_PIDR2_ARCH_GICv3 | 0x0b;
-	case GITS_PIDR4:
-		return 0x40;	/* This is a 64K software visible page */
-	/* The following are the ID registers for (any) GIC. */
-	case GITS_CIDR0:
-		return 0x0d;
-	case GITS_CIDR1:
-		return 0xf0;
-	case GITS_CIDR2:
-		return 0x05;
-	case GITS_CIDR3:
-		return 0xb1;
-	}
-
-	return 0;
-}
-
-static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
-					       phys_addr_t db,
-					       u32 devid, u32 eventid)
-{
-	struct vgic_translation_cache_entry *cte;
-
-	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-		/*
-		 * If we hit a NULL entry, there is nothing after this
-		 * point.
-		 */
-		if (!cte->irq)
-			break;
-
-		if (cte->db != db || cte->devid != devid ||
-		    cte->eventid != eventid)
-			continue;
-
-		/*
-		 * Move this entry to the head, as it is the most
-		 * recently used.
-		 */
-		if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
-			list_move(&cte->entry, &dist->lpi_translation_cache);
-
-		return cte->irq;
-	}
-
-	return NULL;
-}
-
-static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
-					     u32 devid, u32 eventid)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	irq = __vgic_its_check_cache(dist, db, devid, eventid);
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	return irq;
-}
-
-static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
-				       u32 devid, u32 eventid,
-				       struct vgic_irq *irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte;
-	unsigned long flags;
-	phys_addr_t db;
-
-	/* Do not cache a directly injected interrupt */
-	if (irq->hw)
-		return;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	if (unlikely(list_empty(&dist->lpi_translation_cache)))
-		goto out;
-
-	/*
-	 * We could have raced with another CPU caching the same
-	 * translation behind our back, so let's check it is not in
-	 * already
-	 */
-	db = its->vgic_its_base + GITS_TRANSLATER;
-	if (__vgic_its_check_cache(dist, db, devid, eventid))
-		goto out;
-
-	/* Always reuse the last entry (LRU policy) */
-	cte = list_last_entry(&dist->lpi_translation_cache,
-			      typeof(*cte), entry);
-
-	/*
-	 * Caching the translation implies having an extra reference
-	 * to the interrupt, so drop the potential reference on what
-	 * was in the cache, and increment it on the new interrupt.
-	 */
-	if (cte->irq)
-		__vgic_put_lpi_locked(kvm, cte->irq);
-
-	vgic_get_irq_kref(irq);
-
-	cte->db		= db;
-	cte->devid	= devid;
-	cte->eventid	= eventid;
-	cte->irq	= irq;
-
-	/* Move the new translation to the head of the list */
-	list_move(&cte->entry, &dist->lpi_translation_cache);
-
-out:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-void vgic_its_invalidate_cache(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-		/*
-		 * If we hit a NULL entry, there is nothing after this
-		 * point.
-		 */
-		if (!cte->irq)
-			break;
-
-		__vgic_put_lpi_locked(kvm, cte->irq);
-		cte->irq = NULL;
-	}
-
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-			 u32 devid, u32 eventid, struct vgic_irq **irq)
-{
-	struct kvm_vcpu *vcpu;
-	struct its_ite *ite;
-
-	if (!its->enabled)
-		return -EBUSY;
-
-	ite = find_ite(its, devid, eventid);
-	if (!ite || !its_is_collection_mapped(ite->collection))
-		return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-	if (!vcpu)
-		return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-	if (!vcpu->arch.vgic_cpu.lpis_enabled)
-		return -EBUSY;
-
-	vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
-
-	*irq = ite->irq;
-	return 0;
-}
-
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
-{
-	u64 address;
-	struct kvm_io_device *kvm_io_dev;
-	struct vgic_io_device *iodev;
-
-	if (!vgic_has_its(kvm))
-		return ERR_PTR(-ENODEV);
-
-	if (!(msi->flags & KVM_MSI_VALID_DEVID))
-		return ERR_PTR(-EINVAL);
-
-	address = (u64)msi->address_hi << 32 | msi->address_lo;
-
-	kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
-	if (!kvm_io_dev)
-		return ERR_PTR(-EINVAL);
-
-	if (kvm_io_dev->ops != &kvm_io_gic_ops)
-		return ERR_PTR(-EINVAL);
-
-	iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
-	if (iodev->iodev_type != IODEV_ITS)
-		return ERR_PTR(-EINVAL);
-
-	return iodev->its;
-}
-
-/*
- * Find the target VCPU and the LPI number for a given devid/eventid pair
- * and make this IRQ pending, possibly injecting it.
- * Must be called with the its_lock mutex held.
- * Returns 0 on success, a positive error value for any ITS mapping
- * related errors and negative error values for generic errors.
- */
-static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
-				u32 devid, u32 eventid)
-{
-	struct vgic_irq *irq = NULL;
-	unsigned long flags;
-	int err;
-
-	err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
-	if (err)
-		return err;
-
-	if (irq->hw)
-		return irq_set_irqchip_state(irq->host_irq,
-					     IRQCHIP_STATE_PENDING, true);
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->pending_latch = true;
-	vgic_queue_irq_unlock(kvm, irq, flags);
-
-	return 0;
-}
-
-int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
-{
-	struct vgic_irq *irq;
-	unsigned long flags;
-	phys_addr_t db;
-
-	db = (u64)msi->address_hi << 32 | msi->address_lo;
-	irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
-
-	if (!irq)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->pending_latch = true;
-	vgic_queue_irq_unlock(kvm, irq, flags);
-
-	return 0;
-}
-
-/*
- * Queries the KVM IO bus framework to get the ITS pointer from the given
- * doorbell address.
- * We then call vgic_its_trigger_msi() with the decoded data.
- * According to the KVM_SIGNAL_MSI API description returns 1 on success.
- */
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
-{
-	struct vgic_its *its;
-	int ret;
-
-	if (!vgic_its_inject_cached_translation(kvm, msi))
-		return 1;
-
-	its = vgic_msi_to_its(kvm, msi);
-	if (IS_ERR(its))
-		return PTR_ERR(its);
-
-	mutex_lock(&its->its_lock);
-	ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
-	mutex_unlock(&its->its_lock);
-
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
-	 * if the guest has blocked the MSI. So we map any LPI mapping
-	 * related error to that.
-	 */
-	if (ret)
-		return 0;
-	else
-		return 1;
-}
-
-/* Requires the its_lock to be held. */
-static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
-{
-	list_del(&ite->ite_list);
-
-	/* This put matches the get in vgic_add_lpi. */
-	if (ite->irq) {
-		if (ite->irq->hw)
-			WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
-
-		vgic_put_irq(kvm, ite->irq);
-	}
-
-	kfree(ite);
-}
-
-static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
-{
-	return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
-}
-
-#define its_cmd_get_command(cmd)	its_cmd_mask_field(cmd, 0,  0,  8)
-#define its_cmd_get_deviceid(cmd)	its_cmd_mask_field(cmd, 0, 32, 32)
-#define its_cmd_get_size(cmd)		(its_cmd_mask_field(cmd, 1,  0,  5) + 1)
-#define its_cmd_get_id(cmd)		its_cmd_mask_field(cmd, 1,  0, 32)
-#define its_cmd_get_physical_id(cmd)	its_cmd_mask_field(cmd, 1, 32, 32)
-#define its_cmd_get_collection(cmd)	its_cmd_mask_field(cmd, 2,  0, 16)
-#define its_cmd_get_ittaddr(cmd)	(its_cmd_mask_field(cmd, 2,  8, 44) << 8)
-#define its_cmd_get_target_addr(cmd)	its_cmd_mask_field(cmd, 2, 16, 32)
-#define its_cmd_get_validbit(cmd)	its_cmd_mask_field(cmd, 2, 63,  1)
-
-/*
- * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
-				       u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-	ite = find_ite(its, device_id, event_id);
-	if (ite && its_is_collection_mapped(ite->collection)) {
-		/*
-		 * Though the spec talks about removing the pending state, we
-		 * don't bother here since we clear the ITTE anyway and the
-		 * pending state is a property of the ITTE struct.
-		 */
-		vgic_its_invalidate_cache(kvm);
-
-		its_free_ite(kvm, ite);
-		return 0;
-	}
-
-	return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
-}
-
-/*
- * The MOVI command moves an ITTE to a different collection.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct kvm_vcpu *vcpu;
-	struct its_ite *ite;
-	struct its_collection *collection;
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_MOVI_UNMAPPED_INTERRUPT;
-
-	if (!its_is_collection_mapped(ite->collection))
-		return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-	collection = find_collection(its, coll_id);
-	if (!its_is_collection_mapped(collection))
-		return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-	ite->collection = collection;
-	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	vgic_its_invalidate_cache(kvm);
-
-	return update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Check whether an ID can be stored into the corresponding guest table.
- * For a direct table this is pretty easy, but gets a bit nasty for
- * indirect tables. We check whether the resulting guest physical address
- * is actually valid (covered by a memslot and guest accessible).
- * For this we have to read the respective first level entry.
- */
-static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
-			      gpa_t *eaddr)
-{
-	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-	u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
-	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
-	int esz = GITS_BASER_ENTRY_SIZE(baser);
-	int index, idx;
-	gfn_t gfn;
-	bool ret;
-
-	switch (type) {
-	case GITS_BASER_TYPE_DEVICE:
-		if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
-			return false;
-		break;
-	case GITS_BASER_TYPE_COLLECTION:
-		/* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
-		if (id >= BIT_ULL(16))
-			return false;
-		break;
-	default:
-		return false;
-	}
-
-	if (!(baser & GITS_BASER_INDIRECT)) {
-		phys_addr_t addr;
-
-		if (id >= (l1_tbl_size / esz))
-			return false;
-
-		addr = base + id * esz;
-		gfn = addr >> PAGE_SHIFT;
-
-		if (eaddr)
-			*eaddr = addr;
-
-		goto out;
-	}
-
-	/* calculate and check the index into the 1st level */
-	index = id / (SZ_64K / esz);
-	if (index >= (l1_tbl_size / sizeof(u64)))
-		return false;
-
-	/* Each 1st level entry is represented by a 64-bit value. */
-	if (kvm_read_guest_lock(its->dev->kvm,
-			   base + index * sizeof(indirect_ptr),
-			   &indirect_ptr, sizeof(indirect_ptr)))
-		return false;
-
-	indirect_ptr = le64_to_cpu(indirect_ptr);
-
-	/* check the valid bit of the first level entry */
-	if (!(indirect_ptr & BIT_ULL(63)))
-		return false;
-
-	/* Mask the guest physical address and calculate the frame number. */
-	indirect_ptr &= GENMASK_ULL(51, 16);
-
-	/* Find the address of the actual entry */
-	index = id % (SZ_64K / esz);
-	indirect_ptr += index * esz;
-	gfn = indirect_ptr >> PAGE_SHIFT;
-
-	if (eaddr)
-		*eaddr = indirect_ptr;
-
-out:
-	idx = srcu_read_lock(&its->dev->kvm->srcu);
-	ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
-	srcu_read_unlock(&its->dev->kvm->srcu, idx);
-	return ret;
-}
-
-static int vgic_its_alloc_collection(struct vgic_its *its,
-				     struct its_collection **colp,
-				     u32 coll_id)
-{
-	struct its_collection *collection;
-
-	if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
-		return E_ITS_MAPC_COLLECTION_OOR;
-
-	collection = kzalloc(sizeof(*collection), GFP_KERNEL);
-	if (!collection)
-		return -ENOMEM;
-
-	collection->collection_id = coll_id;
-	collection->target_addr = COLLECTION_NOT_MAPPED;
-
-	list_add_tail(&collection->coll_list, &its->collection_list);
-	*colp = collection;
-
-	return 0;
-}
-
-static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
-{
-	struct its_collection *collection;
-	struct its_device *device;
-	struct its_ite *ite;
-
-	/*
-	 * Clearing the mapping for that collection ID removes the
-	 * entry from the list. If there wasn't any before, we can
-	 * go home early.
-	 */
-	collection = find_collection(its, coll_id);
-	if (!collection)
-		return;
-
-	for_each_lpi_its(device, ite, its)
-		if (ite->collection &&
-		    ite->collection->collection_id == coll_id)
-			ite->collection = NULL;
-
-	list_del(&collection->coll_list);
-	kfree(collection);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
-					  struct its_collection *collection,
-					  u32 event_id)
-{
-	struct its_ite *ite;
-
-	ite = kzalloc(sizeof(*ite), GFP_KERNEL);
-	if (!ite)
-		return ERR_PTR(-ENOMEM);
-
-	ite->event_id	= event_id;
-	ite->collection = collection;
-
-	list_add_tail(&ite->ite_list, &device->itt_head);
-	return ite;
-}
-
-/*
- * The MAPTI and MAPI commands map LPIs to ITTEs.
- * Must be called with its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct its_ite *ite;
-	struct kvm_vcpu *vcpu = NULL;
-	struct its_device *device;
-	struct its_collection *collection, *new_coll = NULL;
-	struct vgic_irq *irq;
-	int lpi_nr;
-
-	device = find_its_device(its, device_id);
-	if (!device)
-		return E_ITS_MAPTI_UNMAPPED_DEVICE;
-
-	if (event_id >= BIT_ULL(device->num_eventid_bits))
-		return E_ITS_MAPTI_ID_OOR;
-
-	if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
-		lpi_nr = its_cmd_get_physical_id(its_cmd);
-	else
-		lpi_nr = event_id;
-	if (lpi_nr < GIC_LPI_OFFSET ||
-	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
-		return E_ITS_MAPTI_PHYSICALID_OOR;
-
-	/* If there is an existing mapping, behavior is UNPREDICTABLE. */
-	if (find_ite(its, device_id, event_id))
-		return 0;
-
-	collection = find_collection(its, coll_id);
-	if (!collection) {
-		int ret = vgic_its_alloc_collection(its, &collection, coll_id);
-		if (ret)
-			return ret;
-		new_coll = collection;
-	}
-
-	ite = vgic_its_alloc_ite(device, collection, event_id);
-	if (IS_ERR(ite)) {
-		if (new_coll)
-			vgic_its_free_collection(its, coll_id);
-		return PTR_ERR(ite);
-	}
-
-	if (its_is_collection_mapped(collection))
-		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
-	if (IS_ERR(irq)) {
-		if (new_coll)
-			vgic_its_free_collection(its, coll_id);
-		its_free_ite(kvm, ite);
-		return PTR_ERR(irq);
-	}
-	ite->irq = irq;
-
-	return 0;
-}
-
-/* Requires the its_lock to be held. */
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
-{
-	struct its_ite *ite, *temp;
-
-	/*
-	 * The spec says that unmapping a device with still valid
-	 * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
-	 * since we cannot leave the memory unreferenced.
-	 */
-	list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
-		its_free_ite(kvm, ite);
-
-	vgic_its_invalidate_cache(kvm);
-
-	list_del(&device->dev_list);
-	kfree(device);
-}
-
-/* its lock must be held */
-static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
-{
-	struct its_device *cur, *temp;
-
-	list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
-		vgic_its_free_device(kvm, cur);
-}
-
-/* its lock must be held */
-static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
-{
-	struct its_collection *cur, *temp;
-
-	list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
-		vgic_its_free_collection(its, cur->collection_id);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
-						u32 device_id, gpa_t itt_addr,
-						u8 num_eventid_bits)
-{
-	struct its_device *device;
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		return ERR_PTR(-ENOMEM);
-
-	device->device_id = device_id;
-	device->itt_addr = itt_addr;
-	device->num_eventid_bits = num_eventid_bits;
-	INIT_LIST_HEAD(&device->itt_head);
-
-	list_add_tail(&device->dev_list, &its->device_list);
-	return device;
-}
-
-/*
- * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	bool valid = its_cmd_get_validbit(its_cmd);
-	u8 num_eventid_bits = its_cmd_get_size(its_cmd);
-	gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
-	struct its_device *device;
-
-	if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
-		return E_ITS_MAPD_DEVICE_OOR;
-
-	if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
-		return E_ITS_MAPD_ITTSIZE_OOR;
-
-	device = find_its_device(its, device_id);
-
-	/*
-	 * The spec says that calling MAPD on an already mapped device
-	 * invalidates all cached data for this device. We implement this
-	 * by removing the mapping and re-establishing it.
-	 */
-	if (device)
-		vgic_its_free_device(kvm, device);
-
-	/*
-	 * The spec does not say whether unmapping a not-mapped device
-	 * is an error, so we are done in any case.
-	 */
-	if (!valid)
-		return 0;
-
-	device = vgic_its_alloc_device(its, device_id, itt_addr,
-				       num_eventid_bits);
-
-	return PTR_ERR_OR_ZERO(device);
-}
-
-/*
- * The MAPC command maps collection IDs to redistributors.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u16 coll_id;
-	u32 target_addr;
-	struct its_collection *collection;
-	bool valid;
-
-	valid = its_cmd_get_validbit(its_cmd);
-	coll_id = its_cmd_get_collection(its_cmd);
-	target_addr = its_cmd_get_target_addr(its_cmd);
-
-	if (target_addr >= atomic_read(&kvm->online_vcpus))
-		return E_ITS_MAPC_PROCNUM_OOR;
-
-	if (!valid) {
-		vgic_its_free_collection(its, coll_id);
-		vgic_its_invalidate_cache(kvm);
-	} else {
-		collection = find_collection(its, coll_id);
-
-		if (!collection) {
-			int ret;
-
-			ret = vgic_its_alloc_collection(its, &collection,
-							coll_id);
-			if (ret)
-				return ret;
-			collection->target_addr = target_addr;
-		} else {
-			collection->target_addr = target_addr;
-			update_affinity_collection(kvm, its, collection);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * The CLEAR command removes the pending state for a particular LPI.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
-				     u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
-
-	ite->irq->pending_latch = false;
-
-	if (ite->irq->hw)
-		return irq_set_irqchip_state(ite->irq->host_irq,
-					     IRQCHIP_STATE_PENDING, false);
-
-	return 0;
-}
-
-/*
- * The INV command syncs the configuration bits from the memory table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_INV_UNMAPPED_INTERRUPT;
-
-	return update_lpi_config(kvm, ite->irq, NULL, true);
-}
-
-/*
- * The INVALL command requests flushing of all IRQ data in this collection.
- * Find the VCPU mapped to that collection, then iterate over the VM's list
- * of mapped LPIs and update the configuration for each IRQ which targets
- * the specified vcpu. The configuration will be read from the in-memory
- * configuration table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
-				      u64 *its_cmd)
-{
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct its_collection *collection;
-	struct kvm_vcpu *vcpu;
-	struct vgic_irq *irq;
-	u32 *intids;
-	int irq_count, i;
-
-	collection = find_collection(its, coll_id);
-	if (!its_is_collection_mapped(collection))
-		return E_ITS_INVALL_UNMAPPED_COLLECTION;
-
-	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
-	if (irq_count < 0)
-		return irq_count;
-
-	for (i = 0; i < irq_count; i++) {
-		irq = vgic_get_irq(kvm, NULL, intids[i]);
-		if (!irq)
-			continue;
-		update_lpi_config(kvm, irq, vcpu, false);
-		vgic_put_irq(kvm, irq);
-	}
-
-	kfree(intids);
-
-	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
-		its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
-
-	return 0;
-}
-
-/*
- * The MOVALL command moves the pending state of all IRQs targeting one
- * redistributor to another. We don't hold the pending state in the VCPUs,
- * but in the IRQs instead, so there is really not much to do for us here.
- * However the spec says that no IRQ must target the old redistributor
- * afterwards, so we make sure that no LPI is using the associated target_vcpu.
- * This command affects all LPIs in the system that target that redistributor.
- */
-static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
-				      u64 *its_cmd)
-{
-	u32 target1_addr = its_cmd_get_target_addr(its_cmd);
-	u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
-	struct kvm_vcpu *vcpu1, *vcpu2;
-	struct vgic_irq *irq;
-	u32 *intids;
-	int irq_count, i;
-
-	if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
-	    target2_addr >= atomic_read(&kvm->online_vcpus))
-		return E_ITS_MOVALL_PROCNUM_OOR;
-
-	if (target1_addr == target2_addr)
-		return 0;
-
-	vcpu1 = kvm_get_vcpu(kvm, target1_addr);
-	vcpu2 = kvm_get_vcpu(kvm, target2_addr);
-
-	irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
-	if (irq_count < 0)
-		return irq_count;
-
-	for (i = 0; i < irq_count; i++) {
-		irq = vgic_get_irq(kvm, NULL, intids[i]);
-
-		update_affinity(irq, vcpu2);
-
-		vgic_put_irq(kvm, irq);
-	}
-
-	vgic_its_invalidate_cache(kvm);
-
-	kfree(intids);
-	return 0;
-}
-
-/*
- * The INT command injects the LPI associated with that DevID/EvID pair.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	u32 msi_data = its_cmd_get_id(its_cmd);
-	u64 msi_devid = its_cmd_get_deviceid(its_cmd);
-
-	return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
-}
-
-/*
- * This function is called with the its_cmd lock held, but the ITS data
- * structure lock dropped.
- */
-static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	int ret = -ENODEV;
-
-	mutex_lock(&its->its_lock);
-	switch (its_cmd_get_command(its_cmd)) {
-	case GITS_CMD_MAPD:
-		ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPC:
-		ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPTI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MOVI:
-		ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_DISCARD:
-		ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_CLEAR:
-		ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MOVALL:
-		ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INT:
-		ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INV:
-		ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INVALL:
-		ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_SYNC:
-		/* we ignore this command: we are in sync all of the time */
-		ret = 0;
-		break;
-	}
-	mutex_unlock(&its->its_lock);
-
-	return ret;
-}
-
-static u64 vgic_sanitise_its_baser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
-				  GITS_BASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
-				  GITS_BASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
-				  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	/* We support only one (ITS) page size: 64K */
-	reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
-
-	return reg;
-}
-
-static u64 vgic_sanitise_its_cbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
-				  GITS_CBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
-				  GITS_CBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
-				  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	/* Sanitise the physical address to be 64k aligned. */
-	reg &= ~GENMASK_ULL(15, 12);
-
-	return reg;
-}
-
-static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->cbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
-				       gpa_t addr, unsigned int len,
-				       unsigned long val)
-{
-	/* When GITS_CTLR.Enable is 1, this register is RO. */
-	if (its->enabled)
-		return;
-
-	mutex_lock(&its->cmd_lock);
-	its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
-	its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
-	its->creadr = 0;
-	/*
-	 * CWRITER is architecturally UNKNOWN on reset, but we need to reset
-	 * it to CREADR to make sure we start with an empty command buffer.
-	 */
-	its->cwriter = its->creadr;
-	mutex_unlock(&its->cmd_lock);
-}
-
-#define ITS_CMD_BUFFER_SIZE(baser)	((((baser) & 0xff) + 1) << 12)
-#define ITS_CMD_SIZE			32
-#define ITS_CMD_OFFSET(reg)		((reg) & GENMASK(19, 5))
-
-/* Must be called with the cmd_lock held. */
-static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
-{
-	gpa_t cbaser;
-	u64 cmd_buf[4];
-
-	/* Commands are only processed when the ITS is enabled. */
-	if (!its->enabled)
-		return;
-
-	cbaser = GITS_CBASER_ADDRESS(its->cbaser);
-
-	while (its->cwriter != its->creadr) {
-		int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
-					      cmd_buf, ITS_CMD_SIZE);
-		/*
-		 * If kvm_read_guest() fails, this could be due to the guest
-		 * programming a bogus value in CBASER or something else going
-		 * wrong from which we cannot easily recover.
-		 * According to section 6.3.2 in the GICv3 spec we can just
-		 * ignore that command then.
-		 */
-		if (!ret)
-			vgic_its_handle_command(kvm, its, cmd_buf);
-
-		its->creadr += ITS_CMD_SIZE;
-		if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
-			its->creadr = 0;
-	}
-}
-
-/*
- * By writing to CWRITER the guest announces new commands to be processed.
- * To avoid any races in the first place, we take the its_cmd lock, which
- * protects our ring buffer variables, so that there is only one user
- * per ITS handling commands at a given time.
- */
-static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
-					gpa_t addr, unsigned int len,
-					unsigned long val)
-{
-	u64 reg;
-
-	if (!its)
-		return;
-
-	mutex_lock(&its->cmd_lock);
-
-	reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
-	reg = ITS_CMD_OFFSET(reg);
-	if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-		mutex_unlock(&its->cmd_lock);
-		return;
-	}
-	its->cwriter = reg;
-
-	vgic_its_process_commands(kvm, its);
-
-	mutex_unlock(&its->cmd_lock);
-}
-
-static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
-						struct vgic_its *its,
-						gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->cwriter, addr & 0x7, len);
-}
-
-static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->creadr, addr & 0x7, len);
-}
-
-static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len,
-					      unsigned long val)
-{
-	u32 cmd_offset;
-	int ret = 0;
-
-	mutex_lock(&its->cmd_lock);
-
-	if (its->enabled) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	cmd_offset = ITS_CMD_OFFSET(val);
-	if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	its->creadr = cmd_offset;
-out:
-	mutex_unlock(&its->cmd_lock);
-	return ret;
-}
-
-#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
-static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len)
-{
-	u64 reg;
-
-	switch (BASER_INDEX(addr)) {
-	case 0:
-		reg = its->baser_device_table;
-		break;
-	case 1:
-		reg = its->baser_coll_table;
-		break;
-	default:
-		reg = 0;
-		break;
-	}
-
-	return extract_bytes(reg, addr & 7, len);
-}
-
-#define GITS_BASER_RO_MASK	(GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
-static void vgic_mmio_write_its_baser(struct kvm *kvm,
-				      struct vgic_its *its,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 entry_size, table_type;
-	u64 reg, *regptr, clearbits = 0;
-
-	/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
-	if (its->enabled)
-		return;
-
-	switch (BASER_INDEX(addr)) {
-	case 0:
-		regptr = &its->baser_device_table;
-		entry_size = abi->dte_esz;
-		table_type = GITS_BASER_TYPE_DEVICE;
-		break;
-	case 1:
-		regptr = &its->baser_coll_table;
-		entry_size = abi->cte_esz;
-		table_type = GITS_BASER_TYPE_COLLECTION;
-		clearbits = GITS_BASER_INDIRECT;
-		break;
-	default:
-		return;
-	}
-
-	reg = update_64bit_reg(*regptr, addr & 7, len, val);
-	reg &= ~GITS_BASER_RO_MASK;
-	reg &= ~clearbits;
-
-	reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
-	reg |= table_type << GITS_BASER_TYPE_SHIFT;
-	reg = vgic_sanitise_its_baser(reg);
-
-	*regptr = reg;
-
-	if (!(reg & GITS_BASER_VALID)) {
-		/* Take the its_lock to prevent a race with a save/restore */
-		mutex_lock(&its->its_lock);
-		switch (table_type) {
-		case GITS_BASER_TYPE_DEVICE:
-			vgic_its_free_device_list(kvm, its);
-			break;
-		case GITS_BASER_TYPE_COLLECTION:
-			vgic_its_free_collection_list(kvm, its);
-			break;
-		}
-		mutex_unlock(&its->its_lock);
-	}
-}
-
-static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
-					     struct vgic_its *its,
-					     gpa_t addr, unsigned int len)
-{
-	u32 reg = 0;
-
-	mutex_lock(&its->cmd_lock);
-	if (its->creadr == its->cwriter)
-		reg |= GITS_CTLR_QUIESCENT;
-	if (its->enabled)
-		reg |= GITS_CTLR_ENABLE;
-	mutex_unlock(&its->cmd_lock);
-
-	return reg;
-}
-
-static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	mutex_lock(&its->cmd_lock);
-
-	/*
-	 * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
-	 * device/collection BASER are invalid
-	 */
-	if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
-		(!(its->baser_device_table & GITS_BASER_VALID) ||
-		 !(its->baser_coll_table & GITS_BASER_VALID) ||
-		 !(its->cbaser & GITS_CBASER_VALID)))
-		goto out;
-
-	its->enabled = !!(val & GITS_CTLR_ENABLE);
-	if (!its->enabled)
-		vgic_its_invalidate_cache(kvm);
-
-	/*
-	 * Try to process any pending commands. This function bails out early
-	 * if the ITS is disabled or no commands have been queued.
-	 */
-	vgic_its_process_commands(kvm, its);
-
-out:
-	mutex_unlock(&its->cmd_lock);
-}
-
-#define REGISTER_ITS_DESC(off, rd, wr, length, acc)		\
-{								\
-	.reg_offset = off,					\
-	.len = length,						\
-	.access_flags = acc,					\
-	.its_read = rd,						\
-	.its_write = wr,					\
-}
-
-#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
-{								\
-	.reg_offset = off,					\
-	.len = length,						\
-	.access_flags = acc,					\
-	.its_read = rd,						\
-	.its_write = wr,					\
-	.uaccess_its_write = uwr,				\
-}
-
-static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
-			      gpa_t addr, unsigned int len, unsigned long val)
-{
-	/* Ignore */
-}
-
-static struct vgic_register_region its_registers[] = {
-	REGISTER_ITS_DESC(GITS_CTLR,
-		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
-		vgic_mmio_read_its_iidr, its_mmio_write_wi,
-		vgic_mmio_uaccess_write_its_iidr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_TYPER,
-		vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_CBASER,
-		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_CWRITER,
-		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
-		vgic_mmio_read_its_creadr, its_mmio_write_wi,
-		vgic_mmio_uaccess_write_its_creadr, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_BASER,
-		vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_IDREGS_BASE,
-		vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
-		VGIC_ACCESS_32bit),
-};
-
-/* This is called on setting the LPI enable bit in the redistributor. */
-void vgic_enable_lpis(struct kvm_vcpu *vcpu)
-{
-	if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
-		its_sync_lpi_pending_table(vcpu);
-}
-
-static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
-				   u64 addr)
-{
-	struct vgic_io_device *iodev = &its->iodev;
-	int ret;
-
-	mutex_lock(&kvm->slots_lock);
-	if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	its->vgic_its_base = addr;
-	iodev->regions = its_registers;
-	iodev->nr_regions = ARRAY_SIZE(its_registers);
-	kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
-
-	iodev->base_addr = its->vgic_its_base;
-	iodev->iodev_type = IODEV_ITS;
-	iodev->its = its;
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
-				      KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
-out:
-	mutex_unlock(&kvm->slots_lock);
-
-	return ret;
-}
-
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE	16
-
-void vgic_lpi_translation_cache_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	unsigned int sz;
-	int i;
-
-	if (!list_empty(&dist->lpi_translation_cache))
-		return;
-
-	sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
-
-	for (i = 0; i < sz; i++) {
-		struct vgic_translation_cache_entry *cte;
-
-		/* An allocation failure is not fatal */
-		cte = kzalloc(sizeof(*cte), GFP_KERNEL);
-		if (WARN_ON(!cte))
-			break;
-
-		INIT_LIST_HEAD(&cte->entry);
-		list_add(&cte->entry, &dist->lpi_translation_cache);
-	}
-}
-
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte, *tmp;
-
-	vgic_its_invalidate_cache(kvm);
-
-	list_for_each_entry_safe(cte, tmp,
-				 &dist->lpi_translation_cache, entry) {
-		list_del(&cte->entry);
-		kfree(cte);
-	}
-}
-
-#define INITIAL_BASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)		| \
-	 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)		| \
-	 GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)		| \
-	 GITS_BASER_PAGE_SIZE_64K)
-
-#define INITIAL_PROPBASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)		| \
-	 GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)	| \
-	 GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
-
-static int vgic_its_create(struct kvm_device *dev, u32 type)
-{
-	struct vgic_its *its;
-
-	if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
-		return -ENODEV;
-
-	its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
-	if (!its)
-		return -ENOMEM;
-
-	if (vgic_initialized(dev->kvm)) {
-		int ret = vgic_v4_init(dev->kvm);
-		if (ret < 0) {
-			kfree(its);
-			return ret;
-		}
-
-		vgic_lpi_translation_cache_init(dev->kvm);
-	}
-
-	mutex_init(&its->its_lock);
-	mutex_init(&its->cmd_lock);
-
-	its->vgic_its_base = VGIC_ADDR_UNDEF;
-
-	INIT_LIST_HEAD(&its->device_list);
-	INIT_LIST_HEAD(&its->collection_list);
-
-	dev->kvm->arch.vgic.msis_require_devid = true;
-	dev->kvm->arch.vgic.has_its = true;
-	its->enabled = false;
-	its->dev = dev;
-
-	its->baser_device_table = INITIAL_BASER_VALUE			|
-		((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
-	its->baser_coll_table = INITIAL_BASER_VALUE |
-		((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
-	dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
-
-	dev->private = its;
-
-	return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
-}
-
-static void vgic_its_destroy(struct kvm_device *kvm_dev)
-{
-	struct kvm *kvm = kvm_dev->kvm;
-	struct vgic_its *its = kvm_dev->private;
-
-	mutex_lock(&its->its_lock);
-
-	vgic_its_free_device_list(kvm, its);
-	vgic_its_free_collection_list(kvm, its);
-
-	mutex_unlock(&its->its_lock);
-	kfree(its);
-	kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */
-}
-
-static int vgic_its_has_attr_regs(struct kvm_device *dev,
-				  struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	gpa_t offset = attr->attr;
-	int align;
-
-	align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
-
-	if (offset & align)
-		return -EINVAL;
-
-	region = vgic_find_mmio_region(its_registers,
-				       ARRAY_SIZE(its_registers),
-				       offset);
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-
-static int vgic_its_attr_regs_access(struct kvm_device *dev,
-				     struct kvm_device_attr *attr,
-				     u64 *reg, bool is_write)
-{
-	const struct vgic_register_region *region;
-	struct vgic_its *its;
-	gpa_t addr, offset;
-	unsigned int len;
-	int align, ret = 0;
-
-	its = dev->private;
-	offset = attr->attr;
-
-	/*
-	 * Although the spec supports upper/lower 32-bit accesses to
-	 * 64-bit ITS registers, the userspace ABI requires 64-bit
-	 * accesses to all 64-bit wide registers. We therefore only
-	 * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
-	 * registers
-	 */
-	if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
-		align = 0x3;
-	else
-		align = 0x7;
-
-	if (offset & align)
-		return -EINVAL;
-
-	mutex_lock(&dev->kvm->lock);
-
-	if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	region = vgic_find_mmio_region(its_registers,
-				       ARRAY_SIZE(its_registers),
-				       offset);
-	if (!region) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	addr = its->vgic_its_base + offset;
-
-	len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
-
-	if (is_write) {
-		if (region->uaccess_its_write)
-			ret = region->uaccess_its_write(dev->kvm, its, addr,
-							len, *reg);
-		else
-			region->its_write(dev->kvm, its, addr, len, *reg);
-	} else {
-		*reg = region->its_read(dev->kvm, its, addr, len);
-	}
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static u32 compute_next_devid_offset(struct list_head *h,
-				     struct its_device *dev)
-{
-	struct its_device *next;
-	u32 next_offset;
-
-	if (list_is_last(&dev->dev_list, h))
-		return 0;
-	next = list_next_entry(dev, dev_list);
-	next_offset = next->device_id - dev->device_id;
-
-	return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
-}
-
-static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
-{
-	struct its_ite *next;
-	u32 next_offset;
-
-	if (list_is_last(&ite->ite_list, h))
-		return 0;
-	next = list_next_entry(ite, ite_list);
-	next_offset = next->event_id - ite->event_id;
-
-	return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
-}
-
-/**
- * entry_fn_t - Callback called on a table entry restore path
- * @its: its handle
- * @id: id of the entry
- * @entry: pointer to the entry
- * @opaque: pointer to an opaque data
- *
- * Return: < 0 on error, 0 if last element was identified, id offset to next
- * element otherwise
- */
-typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
-			  void *opaque);
-
-/**
- * scan_its_table - Scan a contiguous table in guest RAM and applies a function
- * to each entry
- *
- * @its: its handle
- * @base: base gpa of the table
- * @size: size of the table in bytes
- * @esz: entry size in bytes
- * @start_id: the ID of the first entry in the table
- * (non zero for 2d level tables)
- * @fn: function to apply on each entry
- *
- * Return: < 0 on error, 0 if last element was identified, 1 otherwise
- * (the last element may not be found on second level tables)
- */
-static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
-			  int start_id, entry_fn_t fn, void *opaque)
-{
-	struct kvm *kvm = its->dev->kvm;
-	unsigned long len = size;
-	int id = start_id;
-	gpa_t gpa = base;
-	char entry[ESZ_MAX];
-	int ret;
-
-	memset(entry, 0, esz);
-
-	while (len > 0) {
-		int next_offset;
-		size_t byte_offset;
-
-		ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
-		if (ret)
-			return ret;
-
-		next_offset = fn(its, id, entry, opaque);
-		if (next_offset <= 0)
-			return next_offset;
-
-		byte_offset = next_offset * esz;
-		id += next_offset;
-		gpa += byte_offset;
-		len -= byte_offset;
-	}
-	return 1;
-}
-
-/**
- * vgic_its_save_ite - Save an interrupt translation entry at @gpa
- */
-static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
-			      struct its_ite *ite, gpa_t gpa, int ite_esz)
-{
-	struct kvm *kvm = its->dev->kvm;
-	u32 next_offset;
-	u64 val;
-
-	next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
-	val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
-	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
-		ite->collection->collection_id;
-	val = cpu_to_le64(val);
-	return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
-}
-
-/**
- * vgic_its_restore_ite - restore an interrupt translation entry
- * @event_id: id used for indexing
- * @ptr: pointer to the ITE entry
- * @opaque: pointer to the its_device
- */
-static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
-				void *ptr, void *opaque)
-{
-	struct its_device *dev = (struct its_device *)opaque;
-	struct its_collection *collection;
-	struct kvm *kvm = its->dev->kvm;
-	struct kvm_vcpu *vcpu = NULL;
-	u64 val;
-	u64 *p = (u64 *)ptr;
-	struct vgic_irq *irq;
-	u32 coll_id, lpi_id;
-	struct its_ite *ite;
-	u32 offset;
-
-	val = *p;
-
-	val = le64_to_cpu(val);
-
-	coll_id = val & KVM_ITS_ITE_ICID_MASK;
-	lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
-
-	if (!lpi_id)
-		return 1; /* invalid entry, no choice but to scan next entry */
-
-	if (lpi_id < VGIC_MIN_LPI)
-		return -EINVAL;
-
-	offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
-	if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
-		return -EINVAL;
-
-	collection = find_collection(its, coll_id);
-	if (!collection)
-		return -EINVAL;
-
-	ite = vgic_its_alloc_ite(dev, collection, event_id);
-	if (IS_ERR(ite))
-		return PTR_ERR(ite);
-
-	if (its_is_collection_mapped(collection))
-		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq = vgic_add_lpi(kvm, lpi_id, vcpu);
-	if (IS_ERR(irq))
-		return PTR_ERR(irq);
-	ite->irq = irq;
-
-	return offset;
-}
-
-static int vgic_its_ite_cmp(void *priv, struct list_head *a,
-			    struct list_head *b)
-{
-	struct its_ite *itea = container_of(a, struct its_ite, ite_list);
-	struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
-
-	if (itea->event_id < iteb->event_id)
-		return -1;
-	else
-		return 1;
-}
-
-static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	gpa_t base = device->itt_addr;
-	struct its_ite *ite;
-	int ret;
-	int ite_esz = abi->ite_esz;
-
-	list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
-
-	list_for_each_entry(ite, &device->itt_head, ite_list) {
-		gpa_t gpa = base + ite->event_id * ite_esz;
-
-		/*
-		 * If an LPI carries the HW bit, this means that this
-		 * interrupt is controlled by GICv4, and we do not
-		 * have direct access to that state. Let's simply fail
-		 * the save operation...
-		 */
-		if (ite->irq->hw)
-			return -EACCES;
-
-		ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_its_restore_itt - restore the ITT of a device
- *
- * @its: its handle
- * @dev: device handle
- *
- * Return 0 on success, < 0 on error
- */
-static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	gpa_t base = dev->itt_addr;
-	int ret;
-	int ite_esz = abi->ite_esz;
-	size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
-
-	ret = scan_its_table(its, base, max_size, ite_esz, 0,
-			     vgic_its_restore_ite, dev);
-
-	/* scan_its_table returns +1 if all ITEs are invalid */
-	if (ret > 0)
-		ret = 0;
-
-	return ret;
-}
-
-/**
- * vgic_its_save_dte - Save a device table entry at a given GPA
- *
- * @its: ITS handle
- * @dev: ITS device
- * @ptr: GPA
- */
-static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
-			     gpa_t ptr, int dte_esz)
-{
-	struct kvm *kvm = its->dev->kvm;
-	u64 val, itt_addr_field;
-	u32 next_offset;
-
-	itt_addr_field = dev->itt_addr >> 8;
-	next_offset = compute_next_devid_offset(&its->device_list, dev);
-	val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
-	       ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
-	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
-		(dev->num_eventid_bits - 1));
-	val = cpu_to_le64(val);
-	return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
-}
-
-/**
- * vgic_its_restore_dte - restore a device table entry
- *
- * @its: its handle
- * @id: device id the DTE corresponds to
- * @ptr: kernel VA where the 8 byte DTE is located
- * @opaque: unused
- *
- * Return: < 0 on error, 0 if the dte is the last one, id offset to the
- * next dte otherwise
- */
-static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
-				void *ptr, void *opaque)
-{
-	struct its_device *dev;
-	gpa_t itt_addr;
-	u8 num_eventid_bits;
-	u64 entry = *(u64 *)ptr;
-	bool valid;
-	u32 offset;
-	int ret;
-
-	entry = le64_to_cpu(entry);
-
-	valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
-	num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
-	itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
-			>> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
-
-	if (!valid)
-		return 1;
-
-	/* dte entry is valid */
-	offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
-
-	dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
-
-	ret = vgic_its_restore_itt(its, dev);
-	if (ret) {
-		vgic_its_free_device(its->dev->kvm, dev);
-		return ret;
-	}
-
-	return offset;
-}
-
-static int vgic_its_device_cmp(void *priv, struct list_head *a,
-			       struct list_head *b)
-{
-	struct its_device *deva = container_of(a, struct its_device, dev_list);
-	struct its_device *devb = container_of(b, struct its_device, dev_list);
-
-	if (deva->device_id < devb->device_id)
-		return -1;
-	else
-		return 1;
-}
-
-/**
- * vgic_its_save_device_tables - Save the device table and all ITT
- * into guest RAM
- *
- * L1/L2 handling is hidden by vgic_its_check_id() helper which directly
- * returns the GPA of the device entry
- */
-static int vgic_its_save_device_tables(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_device_table;
-	struct its_device *dev;
-	int dte_esz = abi->dte_esz;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	list_sort(NULL, &its->device_list, vgic_its_device_cmp);
-
-	list_for_each_entry(dev, &its->device_list, dev_list) {
-		int ret;
-		gpa_t eaddr;
-
-		if (!vgic_its_check_id(its, baser,
-				       dev->device_id, &eaddr))
-			return -EINVAL;
-
-		ret = vgic_its_save_itt(its, dev);
-		if (ret)
-			return ret;
-
-		ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * handle_l1_dte - callback used for L1 device table entries (2 stage case)
- *
- * @its: its handle
- * @id: index of the entry in the L1 table
- * @addr: kernel VA
- * @opaque: unused
- *
- * L1 table entries are scanned by steps of 1 entry
- * Return < 0 if error, 0 if last dte was found when scanning the L2
- * table, +1 otherwise (meaning next L1 entry must be scanned)
- */
-static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
-			 void *opaque)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	int l2_start_id = id * (SZ_64K / abi->dte_esz);
-	u64 entry = *(u64 *)addr;
-	int dte_esz = abi->dte_esz;
-	gpa_t gpa;
-	int ret;
-
-	entry = le64_to_cpu(entry);
-
-	if (!(entry & KVM_ITS_L1E_VALID_MASK))
-		return 1;
-
-	gpa = entry & KVM_ITS_L1E_ADDR_MASK;
-
-	ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
-			     l2_start_id, vgic_its_restore_dte, NULL);
-
-	return ret;
-}
-
-/**
- * vgic_its_restore_device_tables - Restore the device table and all ITT
- * from guest RAM to internal data structs
- */
-static int vgic_its_restore_device_tables(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_device_table;
-	int l1_esz, ret;
-	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-	gpa_t l1_gpa;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-	if (baser & GITS_BASER_INDIRECT) {
-		l1_esz = GITS_LVL1_ENTRY_SIZE;
-		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-				     handle_l1_dte, NULL);
-	} else {
-		l1_esz = abi->dte_esz;
-		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-				     vgic_its_restore_dte, NULL);
-	}
-
-	/* scan_its_table returns +1 if all entries are invalid */
-	if (ret > 0)
-		ret = 0;
-
-	return ret;
-}
-
-static int vgic_its_save_cte(struct vgic_its *its,
-			     struct its_collection *collection,
-			     gpa_t gpa, int esz)
-{
-	u64 val;
-
-	val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
-	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
-	       collection->collection_id);
-	val = cpu_to_le64(val);
-	return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
-}
-
-static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
-{
-	struct its_collection *collection;
-	struct kvm *kvm = its->dev->kvm;
-	u32 target_addr, coll_id;
-	u64 val;
-	int ret;
-
-	BUG_ON(esz > sizeof(val));
-	ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
-	if (ret)
-		return ret;
-	val = le64_to_cpu(val);
-	if (!(val & KVM_ITS_CTE_VALID_MASK))
-		return 0;
-
-	target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
-	coll_id = val & KVM_ITS_CTE_ICID_MASK;
-
-	if (target_addr != COLLECTION_NOT_MAPPED &&
-	    target_addr >= atomic_read(&kvm->online_vcpus))
-		return -EINVAL;
-
-	collection = find_collection(its, coll_id);
-	if (collection)
-		return -EEXIST;
-	ret = vgic_its_alloc_collection(its, &collection, coll_id);
-	if (ret)
-		return ret;
-	collection->target_addr = target_addr;
-	return 1;
-}
-
-/**
- * vgic_its_save_collection_table - Save the collection table into
- * guest RAM
- */
-static int vgic_its_save_collection_table(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_coll_table;
-	gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
-	struct its_collection *collection;
-	u64 val;
-	size_t max_size, filled = 0;
-	int ret, cte_esz = abi->cte_esz;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-	list_for_each_entry(collection, &its->collection_list, coll_list) {
-		ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
-		if (ret)
-			return ret;
-		gpa += cte_esz;
-		filled += cte_esz;
-	}
-
-	if (filled == max_size)
-		return 0;
-
-	/*
-	 * table is not fully filled, add a last dummy element
-	 * with valid bit unset
-	 */
-	val = 0;
-	BUG_ON(cte_esz > sizeof(val));
-	ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
-	return ret;
-}
-
-/**
- * vgic_its_restore_collection_table - reads the collection table
- * in guest memory and restores the ITS internal state. Requires the
- * BASER registers to be restored before.
- */
-static int vgic_its_restore_collection_table(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_coll_table;
-	int cte_esz = abi->cte_esz;
-	size_t max_size, read = 0;
-	gpa_t gpa;
-	int ret;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-	while (read < max_size) {
-		ret = vgic_its_restore_cte(its, gpa, cte_esz);
-		if (ret <= 0)
-			break;
-		gpa += cte_esz;
-		read += cte_esz;
-	}
-
-	if (ret > 0)
-		return 0;
-
-	return ret;
-}
-
-/**
- * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM
- * according to v0 ABI
- */
-static int vgic_its_save_tables_v0(struct vgic_its *its)
-{
-	int ret;
-
-	ret = vgic_its_save_device_tables(its);
-	if (ret)
-		return ret;
-
-	return vgic_its_save_collection_table(its);
-}
-
-/**
- * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
- * to internal data structs according to V0 ABI
- *
- */
-static int vgic_its_restore_tables_v0(struct vgic_its *its)
-{
-	int ret;
-
-	ret = vgic_its_restore_collection_table(its);
-	if (ret)
-		return ret;
-
-	return vgic_its_restore_device_tables(its);
-}
-
-static int vgic_its_commit_v0(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi;
-
-	abi = vgic_its_get_abi(its);
-	its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-	its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-
-	its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
-					<< GITS_BASER_ENTRY_SIZE_SHIFT);
-
-	its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
-					<< GITS_BASER_ENTRY_SIZE_SHIFT);
-	return 0;
-}
-
-static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
-{
-	/* We need to keep the ABI specific field values */
-	its->baser_coll_table &= ~GITS_BASER_VALID;
-	its->baser_device_table &= ~GITS_BASER_VALID;
-	its->cbaser = 0;
-	its->creadr = 0;
-	its->cwriter = 0;
-	its->enabled = 0;
-	vgic_its_free_device_list(kvm, its);
-	vgic_its_free_collection_list(kvm, its);
-}
-
-static int vgic_its_has_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_ITS_ADDR_TYPE:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		case KVM_DEV_ARM_ITS_CTRL_RESET:
-			return 0;
-		case KVM_DEV_ARM_ITS_SAVE_TABLES:
-			return 0;
-		case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
-		return vgic_its_has_attr_regs(dev, attr);
-	}
-	return -ENXIO;
-}
-
-static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	int ret = 0;
-
-	if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
-		return 0;
-
-	mutex_lock(&kvm->lock);
-	mutex_lock(&its->its_lock);
-
-	if (!lock_all_vcpus(kvm)) {
-		mutex_unlock(&its->its_lock);
-		mutex_unlock(&kvm->lock);
-		return -EBUSY;
-	}
-
-	switch (attr) {
-	case KVM_DEV_ARM_ITS_CTRL_RESET:
-		vgic_its_reset(kvm, its);
-		break;
-	case KVM_DEV_ARM_ITS_SAVE_TABLES:
-		ret = abi->save_tables(its);
-		break;
-	case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-		ret = abi->restore_tables(its);
-		break;
-	}
-
-	unlock_all_vcpus(kvm);
-	mutex_unlock(&its->its_lock);
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-static int vgic_its_set_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	struct vgic_its *its = dev->private;
-	int ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		unsigned long type = (unsigned long)attr->attr;
-		u64 addr;
-
-		if (type != KVM_VGIC_ITS_ADDR_TYPE)
-			return -ENODEV;
-
-		if (copy_from_user(&addr, uaddr, sizeof(addr)))
-			return -EFAULT;
-
-		ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
-					addr, SZ_64K);
-		if (ret)
-			return ret;
-
-		return vgic_register_its_iodev(dev->kvm, its, addr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		return vgic_its_ctrl(dev->kvm, its, attr->attr);
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_its_attr_regs_access(dev, attr, &reg, true);
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_its_get_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		struct vgic_its *its = dev->private;
-		u64 addr = its->vgic_its_base;
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (type != KVM_VGIC_ITS_ADDR_TYPE)
-			return -ENODEV;
-
-		if (copy_to_user(uaddr, &addr, sizeof(addr)))
-			return -EFAULT;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-		int ret;
-
-		ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	default:
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-static struct kvm_device_ops kvm_arm_vgic_its_ops = {
-	.name = "kvm-arm-vgic-its",
-	.create = vgic_its_create,
-	.destroy = vgic_its_destroy,
-	.set_attr = vgic_its_set_attr,
-	.get_attr = vgic_its_get_attr,
-	.has_attr = vgic_its_has_attr,
-};
-
-int kvm_vgic_register_its_device(void)
-{
-	return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
-				       KVM_DEV_TYPE_ARM_VGIC_ITS);
-}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
deleted file mode 100644
index 44419679f91a..000000000000
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ /dev/null
@@ -1,741 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGIC: KVM DEVICE API
- *
- * Copyright (C) 2015 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_mmu.h>
-#include <asm/cputype.h>
-#include "vgic.h"
-
-/* common helpers */
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-		      phys_addr_t addr, phys_addr_t alignment)
-{
-	if (addr & ~kvm_phys_mask(kvm))
-		return -E2BIG;
-
-	if (!IS_ALIGNED(addr, alignment))
-		return -EINVAL;
-
-	if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-		return -EEXIST;
-
-	return 0;
-}
-
-static int vgic_check_type(struct kvm *kvm, int type_needed)
-{
-	if (kvm->arch.vgic.vgic_model != type_needed)
-		return -ENODEV;
-	else
-		return 0;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- * Check them for sanity (alignment, double assignment). We can't check for
- * overlapping regions in case of a virtual GICv3 here, since we don't know
- * the number of VCPUs yet, so we defer this check to map_resources().
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-	int r = 0;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	phys_addr_t *addr_ptr, alignment;
-	u64 undef_value = VGIC_ADDR_UNDEF;
-
-	mutex_lock(&kvm->lock);
-	switch (type) {
-	case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		addr_ptr = &vgic->vgic_dist_base;
-		alignment = SZ_4K;
-		break;
-	case KVM_VGIC_V2_ADDR_TYPE_CPU:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		addr_ptr = &vgic->vgic_cpu_base;
-		alignment = SZ_4K;
-		break;
-	case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		addr_ptr = &vgic->vgic_dist_base;
-		alignment = SZ_64K;
-		break;
-	case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
-		struct vgic_redist_region *rdreg;
-
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		if (r)
-			break;
-		if (write) {
-			r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
-			goto out;
-		}
-		rdreg = list_first_entry(&vgic->rd_regions,
-					 struct vgic_redist_region, list);
-		if (!rdreg)
-			addr_ptr = &undef_value;
-		else
-			addr_ptr = &rdreg->base;
-		break;
-	}
-	case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-	{
-		struct vgic_redist_region *rdreg;
-		u8 index;
-
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		if (r)
-			break;
-
-		index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
-
-		if (write) {
-			gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
-			u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
-					>> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-			u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
-					>> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
-
-			if (!count || flags)
-				r = -EINVAL;
-			else
-				r = vgic_v3_set_redist_base(kvm, index,
-							    base, count);
-			goto out;
-		}
-
-		rdreg = vgic_v3_rdist_region_from_index(kvm, index);
-		if (!rdreg) {
-			r = -ENOENT;
-			goto out;
-		}
-
-		*addr = index;
-		*addr |= rdreg->base;
-		*addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-		goto out;
-	}
-	default:
-		r = -ENODEV;
-	}
-
-	if (r)
-		goto out;
-
-	if (write) {
-		r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
-		if (!r)
-			*addr_ptr = *addr;
-	} else {
-		*addr = *addr_ptr;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int vgic_set_common_attr(struct kvm_device *dev,
-				struct kvm_device_attr *attr)
-{
-	int r;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (copy_from_user(&addr, uaddr, sizeof(addr)))
-			return -EFAULT;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-		return (r == -ENODEV) ? -ENXIO : r;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 val;
-		int ret = 0;
-
-		if (get_user(val, uaddr))
-			return -EFAULT;
-
-		/*
-		 * We require:
-		 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-		 * - at most 1024 interrupts
-		 * - a multiple of 32 interrupts
-		 */
-		if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-		    val > VGIC_MAX_RESERVED ||
-		    (val & 31))
-			return -EINVAL;
-
-		mutex_lock(&dev->kvm->lock);
-
-		if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
-			ret = -EBUSY;
-		else
-			dev->kvm->arch.vgic.nr_spis =
-				val - VGIC_NR_PRIVATE_IRQS;
-
-		mutex_unlock(&dev->kvm->lock);
-
-		return ret;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			mutex_lock(&dev->kvm->lock);
-			r = vgic_init(dev->kvm);
-			mutex_unlock(&dev->kvm->lock);
-			return r;
-		}
-		break;
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_get_common_attr(struct kvm_device *dev,
-				struct kvm_device_attr *attr)
-{
-	int r = -ENXIO;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-		if (r)
-			return (r == -ENODEV) ? -ENXIO : r;
-
-		if (copy_to_user(uaddr, &addr, sizeof(addr)))
-			return -EFAULT;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-		r = put_user(dev->kvm->arch.vgic.nr_spis +
-			     VGIC_NR_PRIVATE_IRQS, uaddr);
-		break;
-	}
-	}
-
-	return r;
-}
-
-static int vgic_create(struct kvm_device *dev, u32 type)
-{
-	return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_destroy(struct kvm_device *dev)
-{
-	kfree(dev);
-}
-
-int kvm_register_vgic_device(unsigned long type)
-{
-	int ret = -ENODEV;
-
-	switch (type) {
-	case KVM_DEV_TYPE_ARM_VGIC_V2:
-		ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-					      KVM_DEV_TYPE_ARM_VGIC_V2);
-		break;
-	case KVM_DEV_TYPE_ARM_VGIC_V3:
-		ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-					      KVM_DEV_TYPE_ARM_VGIC_V3);
-
-		if (ret)
-			break;
-		ret = kvm_vgic_register_its_device();
-		break;
-	}
-
-	return ret;
-}
-
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr)
-{
-	int cpuid;
-
-	cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-		 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-	if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
-		return -EINVAL;
-
-	reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-	return 0;
-}
-
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
-	struct kvm_vcpu *tmp_vcpu;
-
-	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-		tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-		mutex_unlock(&tmp_vcpu->mutex);
-	}
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
-	unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
-	struct kvm_vcpu *tmp_vcpu;
-	int c;
-
-	/*
-	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
-	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-	 * that no other VCPUs are run and fiddle with the vgic state while we
-	 * access it.
-	 */
-	kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
-		if (!mutex_trylock(&tmp_vcpu->mutex)) {
-			unlock_vcpus(kvm, c - 1);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/**
- * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address the value is read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v2_attr_regs_access(struct kvm_device *dev,
-				    struct kvm_device_attr *attr,
-				    u32 *reg, bool is_write)
-{
-	struct vgic_reg_attr reg_attr;
-	gpa_t addr;
-	struct kvm_vcpu *vcpu;
-	int ret;
-
-	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	mutex_lock(&dev->kvm->lock);
-
-	ret = vgic_init(dev->kvm);
-	if (ret)
-		goto out;
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_v2_attr_regs_access(dev, attr, &reg, true);
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg = 0;
-
-		ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		case KVM_VGIC_V2_ADDR_TYPE_CPU:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		return vgic_v2_has_attr_regs(dev, attr);
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-	.name = "kvm-arm-vgic-v2",
-	.create = vgic_create,
-	.destroy = vgic_destroy,
-	.set_attr = vgic_v2_set_attr,
-	.get_attr = vgic_v2_get_attr,
-	.has_attr = vgic_v2_has_attr,
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr)
-{
-	unsigned long vgic_mpidr, mpidr_reg;
-
-	/*
-	 * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
-	 * attr might not hold MPIDR. Hence assume vcpu0.
-	 */
-	if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
-		vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
-			      KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
-
-		mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
-		reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
-	} else {
-		reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
-	}
-
-	if (!reg_attr->vcpu)
-		return -EINVAL;
-
-	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-	return 0;
-}
-
-/*
- * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address the value is read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v3_attr_regs_access(struct kvm_device *dev,
-				    struct kvm_device_attr *attr,
-				    u64 *reg, bool is_write)
-{
-	struct vgic_reg_attr reg_attr;
-	gpa_t addr;
-	struct kvm_vcpu *vcpu;
-	int ret;
-	u32 tmp32;
-
-	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	mutex_lock(&dev->kvm->lock);
-
-	if (unlikely(!vgic_initialized(dev->kvm))) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		if (is_write)
-			tmp32 = *reg;
-
-		ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
-		if (!is_write)
-			*reg = tmp32;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-		if (is_write)
-			tmp32 = *reg;
-
-		ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
-		if (!is_write)
-			*reg = tmp32;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 regid;
-
-		regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-		ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
-						  regid, reg);
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		unsigned int info, intid;
-
-		info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-			KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
-		if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
-			intid = attr->attr &
-				KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
-			ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
-							      intid, reg);
-		} else {
-			ret = -EINVAL;
-		}
-		break;
-	}
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 tmp32;
-		u64 reg;
-
-		if (get_user(tmp32, uaddr))
-			return -EFAULT;
-
-		reg = tmp32;
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		if (get_user(tmp32, uaddr))
-			return -EFAULT;
-
-		reg = tmp32;
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		int ret;
-
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-			mutex_lock(&dev->kvm->lock);
-
-			if (!lock_all_vcpus(dev->kvm)) {
-				mutex_unlock(&dev->kvm->lock);
-				return -EBUSY;
-			}
-			ret = vgic_v3_save_pending_tables(dev->kvm);
-			unlock_all_vcpus(dev->kvm);
-			mutex_unlock(&dev->kvm->lock);
-			return ret;
-		}
-		break;
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		tmp32 = reg;
-		return put_user(tmp32, uaddr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		tmp32 = reg;
-		return put_user(tmp32, uaddr);
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-		case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
-		return vgic_v3_has_attr_regs(dev, attr);
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-		      KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
-		      VGIC_LEVEL_INFO_LINE_LEVEL)
-			return 0;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-	.name = "kvm-arm-vgic-v3",
-	.create = vgic_create,
-	.destroy = vgic_destroy,
-	.set_attr = vgic_v3_set_attr,
-	.get_attr = vgic_v3_get_attr,
-	.has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
deleted file mode 100644
index a016f07adc28..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ /dev/null
@@ -1,550 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGICv2 MMIO handling functions
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/nospec.h>
-
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/*
- * The Revision field in the IIDR have the following meanings:
- *
- * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- * 	       their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-	u32 value;
-
-	switch (addr & 0x0c) {
-	case GIC_DIST_CTRL:
-		value = vgic->enabled ? GICD_ENABLE : 0;
-		break;
-	case GIC_DIST_CTR:
-		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-		value = (value >> 5) - 1;
-		value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-		break;
-	case GIC_DIST_IIDR:
-		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-		break;
-	default:
-		return 0;
-	}
-
-	return value;
-}
-
-static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	bool was_enabled = dist->enabled;
-
-	switch (addr & 0x0c) {
-	case GIC_DIST_CTRL:
-		dist->enabled = val & GICD_ENABLE;
-		if (!was_enabled && dist->enabled)
-			vgic_kick_vcpus(vcpu->kvm);
-		break;
-	case GIC_DIST_CTR:
-	case GIC_DIST_IIDR:
-		/* Nothing to do */
-		return;
-	}
-}
-
-static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len,
-					   unsigned long val)
-{
-	switch (addr & 0x0c) {
-	case GIC_DIST_IIDR:
-		if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
-			return -EINVAL;
-
-		/*
-		 * If we observe a write to GICD_IIDR we know that userspace
-		 * has been updated and has had a chance to cope with older
-		 * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
-		 * interrupts as group 1, and therefore we now allow groups to
-		 * be user writable.  Doing this by default would break
-		 * migration from old kernels to new kernels with legacy
-		 * userspace.
-		 */
-		vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
-		return 0;
-	}
-
-	vgic_mmio_write_v2_misc(vcpu, addr, len, val);
-	return 0;
-}
-
-static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len,
-					    unsigned long val)
-{
-	if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
-		vgic_mmio_write_group(vcpu, addr, len, val);
-
-	return 0;
-}
-
-static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
-				 gpa_t addr, unsigned int len,
-				 unsigned long val)
-{
-	int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
-	int intid = val & 0xf;
-	int targets = (val >> 16) & 0xff;
-	int mode = (val >> 24) & 0x03;
-	int c;
-	struct kvm_vcpu *vcpu;
-	unsigned long flags;
-
-	switch (mode) {
-	case 0x0:		/* as specified by targets */
-		break;
-	case 0x1:
-		targets = (1U << nr_vcpus) - 1;			/* all, ... */
-		targets &= ~(1U << source_vcpu->vcpu_id);	/* but self */
-		break;
-	case 0x2:		/* this very vCPU only */
-		targets = (1U << source_vcpu->vcpu_id);
-		break;
-	case 0x3:		/* reserved */
-		return;
-	}
-
-	kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
-		struct vgic_irq *irq;
-
-		if (!(targets & (1U << c)))
-			continue;
-
-		irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = true;
-		irq->source |= 1U << source_vcpu->vcpu_id;
-
-		vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
-		vgic_put_irq(source_vcpu->kvm, irq);
-	}
-}
-
-static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->targets << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len,
-				   unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
-	int i;
-	unsigned long flags;
-
-	/* GICD_ITARGETSR[0-7] are read-only */
-	if (intid < VGIC_NR_PRIVATE_IRQS)
-		return;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
-		int target;
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->targets = (val >> (i * 8)) & cpu_mask;
-		target = irq->targets ? __ffs(irq->targets) : 0;
-		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->source << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-	return val;
-}
-
-static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->source &= ~((val >> (i * 8)) & 0xff);
-		if (!irq->source)
-			irq->pending_latch = false;
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->source |= (val >> (i * 8)) & 0xff;
-
-		if (irq->source) {
-			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		} else {
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-#define GICC_ARCH_VERSION_V2	0x2
-
-/* These are for userland accesses only, there is no guest-facing emulation. */
-static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len)
-{
-	struct vgic_vmcr vmcr;
-	u32 val;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	switch (addr & 0xff) {
-	case GIC_CPU_CTRL:
-		val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
-		val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
-		val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
-		val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
-		val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
-		val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
-
-		break;
-	case GIC_CPU_PRIMASK:
-		/*
-		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-		 * the PMR field as GICH_VMCR.VMPriMask rather than
-		 * GICC_PMR.Priority, so we expose the upper five bits of
-		 * priority mask to userspace using the lower bits in the
-		 * unsigned long.
-		 */
-		val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
-			GICV_PMR_PRIORITY_SHIFT;
-		break;
-	case GIC_CPU_BINPOINT:
-		val = vmcr.bpr;
-		break;
-	case GIC_CPU_ALIAS_BINPOINT:
-		val = vmcr.abpr;
-		break;
-	case GIC_CPU_IDENT:
-		val = ((PRODUCT_ID_KVM << 20) |
-		       (GICC_ARCH_VERSION_V2 << 16) |
-		       IMPLEMENTER_ARM);
-		break;
-	default:
-		return 0;
-	}
-
-	return val;
-}
-
-static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len,
-				   unsigned long val)
-{
-	struct vgic_vmcr vmcr;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	switch (addr & 0xff) {
-	case GIC_CPU_CTRL:
-		vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
-		vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
-		vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
-		vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
-		vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
-		vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
-
-		break;
-	case GIC_CPU_PRIMASK:
-		/*
-		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-		 * the PMR field as GICH_VMCR.VMPriMask rather than
-		 * GICC_PMR.Priority, so we expose the upper five bits of
-		 * priority mask to userspace using the lower bits in the
-		 * unsigned long.
-		 */
-		vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
-			GICV_PMR_PRIORITY_MASK;
-		break;
-	case GIC_CPU_BINPOINT:
-		vmcr.bpr = val;
-		break;
-	case GIC_CPU_ALIAS_BINPOINT:
-		vmcr.abpr = val;
-		break;
-	}
-
-	vgic_set_vmcr(vcpu, &vmcr);
-}
-
-static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
-					gpa_t addr, unsigned int len)
-{
-	int n; /* which APRn is this */
-
-	n = (addr >> 2) & 0x3;
-
-	if (kvm_vgic_global_state.type == VGIC_V2) {
-		/* GICv2 hardware systems support max. 32 groups */
-		if (n != 0)
-			return 0;
-		return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
-	} else {
-		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-		if (n > vgic_v3_max_apr_idx(vcpu))
-			return 0;
-
-		n = array_index_nospec(n, 4);
-
-		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-		return vgicv3->vgic_ap1r[n];
-	}
-}
-
-static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val)
-{
-	int n; /* which APRn is this */
-
-	n = (addr >> 2) & 0x3;
-
-	if (kvm_vgic_global_state.type == VGIC_V2) {
-		/* GICv2 hardware systems support max. 32 groups */
-		if (n != 0)
-			return;
-		vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
-	} else {
-		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-		if (n > vgic_v3_max_apr_idx(vcpu))
-			return;
-
-		n = array_index_nospec(n, 4);
-
-		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-		vgicv3->vgic_ap1r[n] = val;
-	}
-}
-
-static const struct vgic_register_region vgic_v2_dist_registers[] = {
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
-		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
-		NULL, vgic_mmio_uaccess_write_v2_misc,
-		12, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
-		vgic_mmio_read_group, vgic_mmio_write_group,
-		NULL, vgic_mmio_uaccess_write_v2_group, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
-		vgic_mmio_read_enable, vgic_mmio_write_senable,
-		NULL, vgic_uaccess_write_senable, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable,
-		NULL, vgic_uaccess_write_cenable, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
-		vgic_mmio_read_pending, vgic_mmio_write_spending,
-		NULL, vgic_uaccess_write_spending, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending,
-		NULL, vgic_uaccess_write_cpending, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
-		vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
-		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
-		vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
-		vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
-		vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-};
-
-static const struct vgic_register_region vgic_v2_cpu_registers[] = {
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
-		vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
-{
-	dev->regions = vgic_v2_dist_registers;
-	dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-
-	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-	return SZ_4K;
-}
-
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	struct vgic_io_device iodev;
-	struct vgic_reg_attr reg_attr;
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-	int ret;
-
-	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		iodev.regions = vgic_v2_dist_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-		iodev.base_addr = 0;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		iodev.regions = vgic_v2_cpu_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
-		iodev.base_addr = 0;
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	/* We only support aligned 32-bit accesses. */
-	if (addr & 3)
-		return -ENXIO;
-
-	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			  int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v2_cpu_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
-		.iodev_type = IODEV_CPUIF,
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v2_dist_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
-		.iodev_type = IODEV_DIST,
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
deleted file mode 100644
index 89a14ec8b33b..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ /dev/null
@@ -1,1063 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGICv3 MMIO handling functions
- */
-
-#include <linux/bitfield.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/* extract @num bytes at @offset bytes offset in data */
-unsigned long extract_bytes(u64 data, unsigned int offset,
-			    unsigned int num)
-{
-	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
-}
-
-/* allows updates of any half of a 64-bit register (or the whole thing) */
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-		     unsigned long val)
-{
-	int lower = (offset & 4) * 8;
-	int upper = lower + 8 * len - 1;
-
-	reg &= ~GENMASK_ULL(upper, lower);
-	val &= GENMASK_ULL(len * 8 - 1, 0);
-
-	return reg | ((u64)val << lower);
-}
-
-bool vgic_has_its(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
-		return false;
-
-	return dist->has_its;
-}
-
-bool vgic_supports_direct_msis(struct kvm *kvm)
-{
-	return (kvm_vgic_global_state.has_gicv4_1 ||
-		(kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
-}
-
-/*
- * The Revision field in the IIDR have the following meanings:
- *
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- * 	       their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-	u32 value = 0;
-
-	switch (addr & 0x0c) {
-	case GICD_CTLR:
-		if (vgic->enabled)
-			value |= GICD_CTLR_ENABLE_SS_G1;
-		value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-		if (vgic->nassgireq)
-			value |= GICD_CTLR_nASSGIreq;
-		break;
-	case GICD_TYPER:
-		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-		value = (value >> 5) - 1;
-		if (vgic_has_its(vcpu->kvm)) {
-			value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
-			value |= GICD_TYPER_LPIS;
-		} else {
-			value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
-		}
-		break;
-	case GICD_TYPER2:
-		if (kvm_vgic_global_state.has_gicv4_1)
-			value = GICD_TYPER2_nASSGIcap;
-		break;
-	case GICD_IIDR:
-		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-		break;
-	default:
-		return 0;
-	}
-
-	return value;
-}
-
-static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	switch (addr & 0x0c) {
-	case GICD_CTLR: {
-		bool was_enabled, is_hwsgi;
-
-		mutex_lock(&vcpu->kvm->lock);
-
-		was_enabled = dist->enabled;
-		is_hwsgi = dist->nassgireq;
-
-		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
-
-		/* Not a GICv4.1? No HW SGIs */
-		if (!kvm_vgic_global_state.has_gicv4_1)
-			val &= ~GICD_CTLR_nASSGIreq;
-
-		/* Dist stays enabled? nASSGIreq is RO */
-		if (was_enabled && dist->enabled) {
-			val &= ~GICD_CTLR_nASSGIreq;
-			val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
-		}
-
-		/* Switching HW SGIs? */
-		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
-		if (is_hwsgi != dist->nassgireq)
-			vgic_v4_configure_vsgis(vcpu->kvm);
-
-		if (kvm_vgic_global_state.has_gicv4_1 &&
-		    was_enabled != dist->enabled)
-			kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
-		else if (!was_enabled && dist->enabled)
-			vgic_kick_vcpus(vcpu->kvm);
-
-		mutex_unlock(&vcpu->kvm->lock);
-		break;
-	}
-	case GICD_TYPER:
-	case GICD_TYPER2:
-	case GICD_IIDR:
-		/* This is at best for documentation purposes... */
-		return;
-	}
-}
-
-static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len,
-					   unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	switch (addr & 0x0c) {
-	case GICD_TYPER2:
-	case GICD_IIDR:
-		if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
-			return -EINVAL;
-		return 0;
-	case GICD_CTLR:
-		/* Not a GICv4.1? No HW SGIs */
-		if (!kvm_vgic_global_state.has_gicv4_1)
-			val &= ~GICD_CTLR_nASSGIreq;
-
-		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
-		dist->nassgireq = val & GICD_CTLR_nASSGIreq;
-		return 0;
-	}
-
-	vgic_mmio_write_v3_misc(vcpu, addr, len, val);
-	return 0;
-}
-
-static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	int intid = VGIC_ADDR_TO_INTID(addr, 64);
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-	unsigned long ret = 0;
-
-	if (!irq)
-		return 0;
-
-	/* The upper word is RAZ for us. */
-	if (!(addr & 4))
-		ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
-
-	vgic_put_irq(vcpu->kvm, irq);
-	return ret;
-}
-
-static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	int intid = VGIC_ADDR_TO_INTID(addr, 64);
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	/* The upper word is WI for us since we don't implement Aff3. */
-	if (addr & 4)
-		return;
-
-	irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-	if (!irq)
-		return;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-	/* We only care about and preserve Aff0, Aff1 and Aff2. */
-	irq->mpidr = val & GENMASK(23, 0);
-	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
-
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-}
-
-static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
-}
-
-
-static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	bool was_enabled = vgic_cpu->lpis_enabled;
-
-	if (!vgic_has_its(vcpu->kvm))
-		return;
-
-	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
-
-	if (was_enabled && !vgic_cpu->lpis_enabled) {
-		vgic_flush_pending_lpis(vcpu);
-		vgic_its_invalidate_cache(vcpu->kvm);
-	}
-
-	if (!was_enabled && vgic_cpu->lpis_enabled)
-		vgic_enable_lpis(vcpu);
-}
-
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-					      gpa_t addr, unsigned int len)
-{
-	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-	int target_vcpu_id = vcpu->vcpu_id;
-	gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-			(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-	u64 value;
-
-	value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-	value |= ((target_vcpu_id & 0xffff) << 8);
-
-	if (addr == last_rdist_typer)
-		value |= GICR_TYPER_LAST;
-	if (vgic_has_its(vcpu->kvm))
-		value |= GICR_TYPER_PLPIS;
-
-	return extract_bytes(value, addr & 7, len);
-}
-
-static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-}
-
-static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
-					      gpa_t addr, unsigned int len)
-{
-	switch (addr & 0xffff) {
-	case GICD_PIDR2:
-		/* report a GICv3 compliant implementation */
-		return 0x3b;
-	}
-
-	return 0;
-}
-
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
-						  gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/*
-	 * pending state of interrupt is latched in pending_latch variable.
-	 * Userspace will save and restore pending state and line_level
-	 * separately.
-	 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt
-	 * for handling of ISPENDR and ICPENDR.
-	 */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		bool state = irq->pending_latch;
-
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			int err;
-
-			err = irq_get_irqchip_state(irq->host_irq,
-						    IRQCHIP_STATE_PENDING,
-						    &state);
-			WARN_ON(err);
-		}
-
-		if (state)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
-					 gpa_t addr, unsigned int len,
-					 unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		if (test_bit(i, &val)) {
-			/*
-			 * pending_latch is set irrespective of irq type
-			 * (level or edge) to avoid dependency that VM should
-			 * restore irq config before pending info.
-			 */
-			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		} else {
-			irq->pending_latch = false;
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-/* We want to avoid outer shareable. */
-u64 vgic_sanitise_shareability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_OuterShareable:
-		return GIC_BASER_InnerShareable;
-	default:
-		return field;
-	}
-}
-
-/* Avoid any inner non-cacheable mapping. */
-u64 vgic_sanitise_inner_cacheability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_CACHE_nCnB:
-	case GIC_BASER_CACHE_nC:
-		return GIC_BASER_CACHE_RaWb;
-	default:
-		return field;
-	}
-}
-
-/* Non-cacheable or same-as-inner are OK. */
-u64 vgic_sanitise_outer_cacheability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_CACHE_SameAsInner:
-	case GIC_BASER_CACHE_nC:
-		return field;
-	default:
-		return GIC_BASER_CACHE_nC;
-	}
-}
-
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-			u64 (*sanitise_fn)(u64))
-{
-	u64 field = (reg & field_mask) >> field_shift;
-
-	field = sanitise_fn(field) << field_shift;
-	return (reg & ~field_mask) | field;
-}
-
-#define PROPBASER_RES0_MASK						\
-	(GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
-#define PENDBASER_RES0_MASK						\
-	(BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |	\
-	 GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
-
-static u64 vgic_sanitise_pendbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
-				  GICR_PENDBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
-				  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
-				  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	reg &= ~PENDBASER_RES0_MASK;
-
-	return reg;
-}
-
-static u64 vgic_sanitise_propbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
-				  GICR_PROPBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
-				  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
-				  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	reg &= ~PROPBASER_RES0_MASK;
-	return reg;
-}
-
-static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return extract_bytes(dist->propbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	u64 old_propbaser, propbaser;
-
-	/* Storing a value with LPIs already enabled is undefined */
-	if (vgic_cpu->lpis_enabled)
-		return;
-
-	do {
-		old_propbaser = READ_ONCE(dist->propbaser);
-		propbaser = old_propbaser;
-		propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
-		propbaser = vgic_sanitise_propbaser(propbaser);
-	} while (cmpxchg64(&dist->propbaser, old_propbaser,
-			   propbaser) != old_propbaser);
-}
-
-static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	u64 value = vgic_cpu->pendbaser;
-
-	value &= ~GICR_PENDBASER_PTZ;
-
-	return extract_bytes(value, addr & 7, len);
-}
-
-static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	u64 old_pendbaser, pendbaser;
-
-	/* Storing a value with LPIs already enabled is undefined */
-	if (vgic_cpu->lpis_enabled)
-		return;
-
-	do {
-		old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
-		pendbaser = old_pendbaser;
-		pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
-		pendbaser = vgic_sanitise_pendbaser(pendbaser);
-	} while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
-			   pendbaser) != old_pendbaser);
-}
-
-/*
- * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
- * redistributors, while SPIs are covered by registers in the distributor
- * block. Trying to set private IRQs in this block gets ignored.
- * We take some special care here to fix the calculation of the register
- * offset.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = bpi,					\
-		.len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,		\
-		.access_flags = acc,					\
-		.read = vgic_mmio_read_raz,				\
-		.write = vgic_mmio_write_wi,				\
-	}, {								\
-		.reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,	\
-		.bits_per_irq = bpi,					\
-		.len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,	\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = ur,					\
-		.uaccess_write = uw,					\
-	}
-
-static const struct vgic_register_region vgic_v3_dist_registers[] = {
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
-		vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
-		NULL, vgic_mmio_uaccess_write_v3_misc,
-		16, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
-		vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
-		vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
-		vgic_mmio_read_enable, vgic_mmio_write_senable,
-		NULL, vgic_uaccess_write_senable, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable,
-	       NULL, vgic_uaccess_write_cenable, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
-		vgic_mmio_read_pending, vgic_mmio_write_spending,
-		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending,
-		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
-		1, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
-		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
-		vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
-		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-		VGIC_ACCESS_32bit),
-};
-
-static const struct vgic_register_region vgic_v3_rd_registers[] = {
-	/* RD_base registers */
-	REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-		vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
-		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
-		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-		vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-		vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
-		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-		VGIC_ACCESS_32bit),
-	/* SGI_base registers */
-	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
-		vgic_mmio_read_group, vgic_mmio_write_group, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
-		vgic_mmio_read_enable, vgic_mmio_write_senable,
-		NULL, vgic_uaccess_write_senable, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable,
-		NULL, vgic_uaccess_write_cenable, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
-		vgic_mmio_read_pending, vgic_mmio_write_spending,
-		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending,
-		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
-		vgic_mmio_read_config, vgic_mmio_write_config, 8,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
-{
-	dev->regions = vgic_v3_dist_registers;
-	dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-
-	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-	return SZ_64K;
-}
-
-/**
- * vgic_register_redist_iodev - register a single redist iodev
- * @vcpu:    The VCPU to which the redistributor belongs
- *
- * Register a KVM iodev for this VCPU's redistributor using the address
- * provided.
- *
- * Return 0 on success, -ERRNO otherwise.
- */
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-	struct vgic_redist_region *rdreg;
-	gpa_t rd_base;
-	int ret;
-
-	if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
-		return 0;
-
-	/*
-	 * We may be creating VCPUs before having set the base address for the
-	 * redistributor region, in which case we will come back to this
-	 * function for all VCPUs when the base address is set.  Just return
-	 * without doing any work for now.
-	 */
-	rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
-	if (!rdreg)
-		return 0;
-
-	if (!vgic_v3_check_base(kvm))
-		return -EINVAL;
-
-	vgic_cpu->rdreg = rdreg;
-
-	rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
-
-	kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
-	rd_dev->base_addr = rd_base;
-	rd_dev->iodev_type = IODEV_REDIST;
-	rd_dev->regions = vgic_v3_rd_registers;
-	rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
-	rd_dev->redist_vcpu = vcpu;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
-				      2 * SZ_64K, &rd_dev->dev);
-	mutex_unlock(&kvm->slots_lock);
-
-	if (ret)
-		return ret;
-
-	rdreg->free_index++;
-	return 0;
-}
-
-static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
-{
-	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-
-	kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
-}
-
-static int vgic_register_all_redist_iodevs(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int c, ret = 0;
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		ret = vgic_register_redist_iodev(vcpu);
-		if (ret)
-			break;
-	}
-
-	if (ret) {
-		/* The current c failed, so we start with the previous one. */
-		mutex_lock(&kvm->slots_lock);
-		for (c--; c >= 0; c--) {
-			vcpu = kvm_get_vcpu(kvm, c);
-			vgic_unregister_redist_iodev(vcpu);
-		}
-		mutex_unlock(&kvm->slots_lock);
-	}
-
-	return ret;
-}
-
-/**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
- *
- * Performs various checks before inserting the rdist region in the list.
- * Those tests depend on whether the size of the rdist region is known
- * (ie. count != 0). The list is sorted by rdist region index.
- *
- * @kvm: kvm handle
- * @index: redist region index
- * @base: base of the new rdist region
- * @count: number of redistributors the region is made of (0 in the old style
- * single region, whose size is induced from the number of vcpus)
- *
- * Return 0 on success, < 0 otherwise
- */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-					gpa_t base, uint32_t count)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-	struct list_head *rd_regions = &d->rd_regions;
-	size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
-	int ret;
-
-	/* single rdist region already set ?*/
-	if (!count && !list_empty(rd_regions))
-		return -EINVAL;
-
-	/* cross the end of memory ? */
-	if (base + size < base)
-		return -EINVAL;
-
-	if (list_empty(rd_regions)) {
-		if (index != 0)
-			return -EINVAL;
-	} else {
-		rdreg = list_last_entry(rd_regions,
-					struct vgic_redist_region, list);
-		if (index != rdreg->index + 1)
-			return -EINVAL;
-
-		/* Cannot add an explicitly sized regions after legacy region */
-		if (!rdreg->count)
-			return -EINVAL;
-	}
-
-	/*
-	 * For legacy single-region redistributor regions (!count),
-	 * check that the redistributor region does not overlap with the
-	 * distributor's address space.
-	 */
-	if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-		vgic_dist_overlap(kvm, base, size))
-		return -EINVAL;
-
-	/* collision with any other rdist region? */
-	if (vgic_v3_rdist_overlap(kvm, base, size))
-		return -EINVAL;
-
-	rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
-	if (!rdreg)
-		return -ENOMEM;
-
-	rdreg->base = VGIC_ADDR_UNDEF;
-
-	ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
-	if (ret)
-		goto free;
-
-	rdreg->base = base;
-	rdreg->count = count;
-	rdreg->free_index = 0;
-	rdreg->index = index;
-
-	list_add_tail(&rdreg->list, rd_regions);
-	return 0;
-free:
-	kfree(rdreg);
-	return ret;
-}
-
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
-{
-	int ret;
-
-	ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
-	if (ret)
-		return ret;
-
-	/*
-	 * Register iodevs for each existing VCPU.  Adding more VCPUs
-	 * afterwards will register the iodevs when needed.
-	 */
-	ret = vgic_register_all_redist_iodevs(kvm);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	struct vgic_io_device iodev;
-	struct vgic_reg_attr reg_attr;
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-	int ret;
-
-	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		iodev.regions = vgic_v3_dist_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-		iodev.base_addr = 0;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{
-		iodev.regions = vgic_v3_rd_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
-		iodev.base_addr = 0;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 reg, id;
-
-		id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-		return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
-	}
-	default:
-		return -ENXIO;
-	}
-
-	/* We only support aligned 32-bit accesses. */
-	if (addr & 3)
-		return -ENXIO;
-
-	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-	unsigned long affinity;
-	int level0;
-
-	/*
-	 * Split the current VCPU's MPIDR into affinity level 0 and the
-	 * rest as this is what we have to compare against.
-	 */
-	affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-	level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-	affinity &= ~MPIDR_LEVEL_MASK;
-
-	/* bail out if the upper three levels don't match */
-	if (sgi_aff != affinity)
-		return -1;
-
-	/* Is this VCPU's bit set in the mask ? */
-	if (!(sgi_cpu_mask & BIT(level0)))
-		return -1;
-
-	return level0;
-}
-
-/*
- * The ICC_SGI* registers encode the affinity differently from the MPIDR,
- * so provide a wrapper to use the existing defines to isolate a certain
- * affinity level.
- */
-#define SGI_AFFINITY_LEVEL(reg, level) \
-	((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-	>> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
- * @allow_group1: Does the sysreg access allow generation of G1 SGIs
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *c_vcpu;
-	u16 target_cpus;
-	u64 mpidr;
-	int sgi, c;
-	int vcpu_id = vcpu->vcpu_id;
-	bool broadcast;
-	unsigned long flags;
-
-	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-	broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-	target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-	mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-	/*
-	 * We iterate over all VCPUs to find the MPIDRs matching the request.
-	 * If we have handled one CPU, we clear its bit to detect early
-	 * if we are already finished. This avoids iterating through all
-	 * VCPUs when most of the times we just signal a single VCPU.
-	 */
-	kvm_for_each_vcpu(c, c_vcpu, kvm) {
-		struct vgic_irq *irq;
-
-		/* Exit early if we have dealt with all requested CPUs */
-		if (!broadcast && target_cpus == 0)
-			break;
-
-		/* Don't signal the calling VCPU */
-		if (broadcast && c == vcpu_id)
-			continue;
-
-		if (!broadcast) {
-			int level0;
-
-			level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-			if (level0 == -1)
-				continue;
-
-			/* remove this matching VCPU from the mask */
-			target_cpus &= ~BIT(level0);
-		}
-
-		irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		/*
-		 * An access targetting Group0 SGIs can only generate
-		 * those, while an access targetting Group1 SGIs can
-		 * generate interrupts of either group.
-		 */
-		if (!irq->group || allow_group1) {
-			if (!irq->hw) {
-				irq->pending_latch = true;
-				vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-			} else {
-				/* HW SGI? Ask the GIC to inject it */
-				int err;
-				err = irq_set_irqchip_state(irq->host_irq,
-							    IRQCHIP_STATE_PENDING,
-							    true);
-				WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-				raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-			}
-		} else {
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v3_dist_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			   int offset, u32 *val)
-{
-	struct vgic_io_device rd_dev = {
-		.regions = vgic_v3_rd_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
-	};
-
-	return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
-}
-
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-				    u32 intid, u64 *val)
-{
-	if (intid % 32)
-		return -EINVAL;
-
-	if (is_write)
-		vgic_write_irq_line_level_info(vcpu, intid, *val);
-	else
-		*val = vgic_read_irq_line_level_info(vcpu, intid);
-
-	return 0;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
deleted file mode 100644
index b2d73fc0d1ef..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ /dev/null
@@ -1,1088 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VGIC MMIO handling functions
- */
-
-#include <linux/bitops.h>
-#include <linux/bsearch.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_arch_timer.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len)
-{
-	return 0;
-}
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len)
-{
-	return -1UL;
-}
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			unsigned int len, unsigned long val)
-{
-	/* Ignore */
-}
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			       unsigned int len, unsigned long val)
-{
-	/* Ignore */
-	return 0;
-}
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->group)
-			value |= BIT(i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-static void vgic_update_vsgi(struct vgic_irq *irq)
-{
-	WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group));
-}
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-			   unsigned int len, unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->group = !!(val & BIT(i));
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			vgic_update_vsgi(irq);
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		} else {
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		}
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-/*
- * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
- * of the enabled bit, so there is only one function for both here.
- */
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->enabled)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			if (!irq->enabled) {
-				struct irq_data *data;
-
-				irq->enabled = true;
-				data = &irq_to_desc(irq->host_irq)->irq_data;
-				while (irqd_irq_disabled(data))
-					enable_irq(irq->host_irq);
-			}
-
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-			vgic_put_irq(vcpu->kvm, irq);
-
-			continue;
-		} else if (vgic_irq_is_mapped_level(irq)) {
-			bool was_high = irq->line_level;
-
-			/*
-			 * We need to update the state of the interrupt because
-			 * the guest might have changed the state of the device
-			 * while the interrupt was disabled at the VGIC level.
-			 */
-			irq->line_level = vgic_get_phys_line_level(irq);
-			/*
-			 * Deactivate the physical interrupt so the GIC will let
-			 * us know when it is asserted again.
-			 */
-			if (!irq->active && was_high && !irq->line_level)
-				vgic_irq_set_phys_active(irq, false);
-		}
-		irq->enabled = true;
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
-			disable_irq_nosync(irq->host_irq);
-
-		irq->enabled = false;
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
-			       gpa_t addr, unsigned int len,
-			       unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->enabled = true;
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
-			       gpa_t addr, unsigned int len,
-			       unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->enabled = false;
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		unsigned long flags;
-		bool val;
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			int err;
-
-			val = false;
-			err = irq_get_irqchip_state(irq->host_irq,
-						    IRQCHIP_STATE_PENDING,
-						    &val);
-			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-		} else {
-			val = irq_is_pending(irq);
-		}
-
-		value |= ((u32)val << i);
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
-{
-	return (vgic_irq_is_sgi(irq->intid) &&
-		vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
-}
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		/* GICD_ISPENDR0 SGI bits are WI */
-		if (is_vgic_v2_sgi(vcpu, irq)) {
-			vgic_put_irq(vcpu->kvm, irq);
-			continue;
-		}
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			/* HW SGI? Ask the GIC to inject it */
-			int err;
-			err = irq_set_irqchip_state(irq->host_irq,
-						    IRQCHIP_STATE_PENDING,
-						    true);
-			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-			vgic_put_irq(vcpu->kvm, irq);
-
-			continue;
-		}
-
-		irq->pending_latch = true;
-		if (irq->hw)
-			vgic_irq_set_phys_active(irq, true);
-
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = true;
-
-		/*
-		 * GICv2 SGIs are terribly broken. We can't restore
-		 * the source of the interrupt, so just pick the vcpu
-		 * itself as the source...
-		 */
-		if (is_vgic_v2_sgi(vcpu, irq))
-			irq->source |= BIT(vcpu->vcpu_id);
-
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
-{
-	irq->pending_latch = false;
-
-	/*
-	 * We don't want the guest to effectively mask the physical
-	 * interrupt by doing a write to SPENDR followed by a write to
-	 * CPENDR for HW interrupts, so we clear the active state on
-	 * the physical side if the virtual interrupt is not active.
-	 * This may lead to taking an additional interrupt on the
-	 * host, but that should not be a problem as the worst that
-	 * can happen is an additional vgic injection.  We also clear
-	 * the pending state to maintain proper semantics for edge HW
-	 * interrupts.
-	 */
-	vgic_irq_set_phys_pending(irq, false);
-	if (!irq->active)
-		vgic_irq_set_phys_active(irq, false);
-}
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		/* GICD_ICPENDR0 SGI bits are WI */
-		if (is_vgic_v2_sgi(vcpu, irq)) {
-			vgic_put_irq(vcpu->kvm, irq);
-			continue;
-		}
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-			/* HW SGI? Ask the GIC to clear its pending bit */
-			int err;
-			err = irq_set_irqchip_state(irq->host_irq,
-						    IRQCHIP_STATE_PENDING,
-						    false);
-			WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
-
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-			vgic_put_irq(vcpu->kvm, irq);
-
-			continue;
-		}
-
-		if (irq->hw)
-			vgic_hw_irq_cpending(vcpu, irq);
-		else
-			irq->pending_latch = false;
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		/*
-		 * More fun with GICv2 SGIs! If we're clearing one of them
-		 * from userspace, which source vcpu to clear? Let's not
-		 * even think of it, and blow the whole set.
-		 */
-		if (is_vgic_v2_sgi(vcpu, irq))
-			irq->source = 0;
-
-		irq->pending_latch = false;
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-/*
- * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
- * is not queued on some running VCPU's LRs, because then the change to the
- * active state can be overwritten when the VCPU's state is synced coming back
- * from the guest.
- *
- * For shared interrupts as well as GICv3 private interrupts, we have to
- * stop all the VCPUs because interrupts can be migrated while we don't hold
- * the IRQ locks and we don't want to be chasing moving targets.
- *
- * For GICv2 private interrupts we don't have to do anything because
- * userspace accesses to the VGIC state already require all VCPUs to be
- * stopped, and only the VCPU itself can modify its private interrupts
- * active state, which guarantees that the VCPU is not running.
- */
-static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
-{
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-	    intid >= VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_halt_guest(vcpu->kvm);
-}
-
-/* See vgic_access_active_prepare */
-static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
-{
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-	    intid >= VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_resume_guest(vcpu->kvm);
-}
-
-static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		/*
-		 * Even for HW interrupts, don't evaluate the HW state as
-		 * all the guest is interested in is the virtual state.
-		 */
-		if (irq->active)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 val;
-
-	mutex_lock(&vcpu->kvm->lock);
-	vgic_access_active_prepare(vcpu, intid);
-
-	val = __vgic_mmio_read_active(vcpu, addr, len);
-
-	vgic_access_active_finish(vcpu, intid);
-	mutex_unlock(&vcpu->kvm->lock);
-
-	return val;
-}
-
-unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	return __vgic_mmio_read_active(vcpu, addr, len);
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				      bool active, bool is_uaccess)
-{
-	if (is_uaccess)
-		return;
-
-	irq->active = active;
-	vgic_irq_set_phys_active(irq, active);
-}
-
-static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				    bool active)
-{
-	unsigned long flags;
-	struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
-		vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
-	} else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
-		/*
-		 * GICv4.1 VSGI feature doesn't track an active state,
-		 * so let's not kid ourselves, there is nothing we can
-		 * do here.
-		 */
-		irq->active = false;
-	} else {
-		u32 model = vcpu->kvm->arch.vgic.vgic_model;
-		u8 active_source;
-
-		irq->active = active;
-
-		/*
-		 * The GICv2 architecture indicates that the source CPUID for
-		 * an SGI should be provided during an EOI which implies that
-		 * the active state is stored somewhere, but at the same time
-		 * this state is not architecturally exposed anywhere and we
-		 * have no way of knowing the right source.
-		 *
-		 * This may lead to a VCPU not being able to receive
-		 * additional instances of a particular SGI after migration
-		 * for a GICv2 VM on some GIC implementations.  Oh well.
-		 */
-		active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
-
-		if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		    active && vgic_irq_is_sgi(irq->intid))
-			irq->active_source = active_source;
-	}
-
-	if (irq->active)
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-	else
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-}
-
-static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		vgic_mmio_change_active(vcpu, irq, false);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-	mutex_lock(&vcpu->kvm->lock);
-	vgic_access_active_prepare(vcpu, intid);
-
-	__vgic_mmio_write_cactive(vcpu, addr, len, val);
-
-	vgic_access_active_finish(vcpu, intid);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	__vgic_mmio_write_cactive(vcpu, addr, len, val);
-	return 0;
-}
-
-static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		vgic_mmio_change_active(vcpu, irq, true);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-	mutex_lock(&vcpu->kvm->lock);
-	vgic_access_active_prepare(vcpu, intid);
-
-	__vgic_mmio_write_sactive(vcpu, addr, len, val);
-
-	vgic_access_active_finish(vcpu, intid);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	__vgic_mmio_write_sactive(vcpu, addr, len, val);
-	return 0;
-}
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->priority << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-/*
- * We currently don't handle changing the priority of an interrupt that
- * is already pending on a VCPU. If there is a need for this, we would
- * need to make this VCPU exit and re-evaluate the priorities, potentially
- * leading to this interrupt getting presented now to the guest (if it has
- * been masked by the priority mask before).
- */
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		/* Narrow the priority range to what we actually support */
-		irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
-		if (irq->hw && vgic_irq_is_sgi(irq->intid))
-			vgic_update_vsgi(irq);
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-	u32 value = 0;
-	int i;
-
-	for (i = 0; i < len * 4; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			value |= (2U << (i * 2));
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-			    gpa_t addr, unsigned int len,
-			    unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 4; i++) {
-		struct vgic_irq *irq;
-
-		/*
-		 * The configuration cannot be changed for SGIs in general,
-		 * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
-		 * code relies on PPIs being level triggered, so we also
-		 * make them read-only here.
-		 */
-		if (intid + i < VGIC_NR_PRIVATE_IRQS)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (test_bit(i * 2 + 1, &val))
-			irq->config = VGIC_CONFIG_EDGE;
-		else
-			irq->config = VGIC_CONFIG_LEVEL;
-
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
-{
-	int i;
-	u64 val = 0;
-	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-	for (i = 0; i < 32; i++) {
-		struct vgic_irq *irq;
-
-		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
-			val |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-				    const u64 val)
-{
-	int i;
-	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-	unsigned long flags;
-
-	for (i = 0; i < 32; i++) {
-		struct vgic_irq *irq;
-		bool new_level;
-
-		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		/*
-		 * Line level is set irrespective of irq type
-		 * (level or edge) to avoid dependency that VM should
-		 * restore irq config before line level.
-		 */
-		new_level = !!(val & (1U << i));
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->line_level = new_level;
-		if (new_level)
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		else
-			raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static int match_region(const void *key, const void *elt)
-{
-	const unsigned int offset = (unsigned long)key;
-	const struct vgic_register_region *region = elt;
-
-	if (offset < region->reg_offset)
-		return -1;
-
-	if (offset >= region->reg_offset + region->len)
-		return 1;
-
-	return 0;
-}
-
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-		      int nr_regions, unsigned int offset)
-{
-	return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
-		       sizeof(regions[0]), match_region);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_set_vmcr(vcpu, vmcr);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_get_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_get_vmcr(vcpu, vmcr);
-}
-
-/*
- * kvm_mmio_read_buf() returns a value in a format where it can be converted
- * to a byte array and be directly observed as the guest wanted it to appear
- * in memory if it had done the store itself, which is LE for the GIC, as the
- * guest knows the GIC is always LE.
- *
- * We convert this value to the CPUs native format to deal with it as a data
- * value.
- */
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
-{
-	unsigned long data = kvm_mmio_read_buf(val, len);
-
-	switch (len) {
-	case 1:
-		return data;
-	case 2:
-		return le16_to_cpu(data);
-	case 4:
-		return le32_to_cpu(data);
-	default:
-		return le64_to_cpu(data);
-	}
-}
-
-/*
- * kvm_mmio_write_buf() expects a value in a format such that if converted to
- * a byte array it is observed as the guest would see it if it could perform
- * the load directly.  Since the GIC is LE, and the guest knows this, the
- * guest expects a value in little endian format.
- *
- * We convert the data value from the CPUs native format to LE so that the
- * value is returned in the proper format.
- */
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-				unsigned long data)
-{
-	switch (len) {
-	case 1:
-		break;
-	case 2:
-		data = cpu_to_le16(data);
-		break;
-	case 4:
-		data = cpu_to_le32(data);
-		break;
-	default:
-		data = cpu_to_le64(data);
-	}
-
-	kvm_mmio_write_buf(buf, len, data);
-}
-
-static
-struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
-{
-	return container_of(dev, struct vgic_io_device, dev);
-}
-
-static bool check_region(const struct kvm *kvm,
-			 const struct vgic_register_region *region,
-			 gpa_t addr, int len)
-{
-	int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-	switch (len) {
-	case sizeof(u8):
-		flags = VGIC_ACCESS_8bit;
-		break;
-	case sizeof(u32):
-		flags = VGIC_ACCESS_32bit;
-		break;
-	case sizeof(u64):
-		flags = VGIC_ACCESS_64bit;
-		break;
-	default:
-		return false;
-	}
-
-	if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
-		if (!region->bits_per_irq)
-			return true;
-
-		/* Do we access a non-allocated IRQ? */
-		return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
-	}
-
-	return false;
-}
-
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-		     gpa_t addr, int len)
-{
-	const struct vgic_register_region *region;
-
-	region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
-				       addr - iodev->base_addr);
-	if (!region || !check_region(vcpu->kvm, region, addr, len))
-		return NULL;
-
-	return region;
-}
-
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			     gpa_t addr, u32 *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-	if (!region) {
-		*val = 0;
-		return 0;
-	}
-
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	if (region->uaccess_read)
-		*val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
-	else
-		*val = region->read(r_vcpu, addr, sizeof(u32));
-
-	return 0;
-}
-
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			      gpa_t addr, const u32 *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-	if (!region)
-		return 0;
-
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	if (region->uaccess_write)
-		return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
-
-	region->write(r_vcpu, addr, sizeof(u32), *val);
-	return 0;
-}
-
-/*
- * Userland access to VGIC registers.
- */
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-		 bool is_write, int offset, u32 *val)
-{
-	if (is_write)
-		return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
-	else
-		return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
-}
-
-static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			      gpa_t addr, int len, void *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	unsigned long data = 0;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-	if (!region) {
-		memset(val, 0, len);
-		return 0;
-	}
-
-	switch (iodev->iodev_type) {
-	case IODEV_CPUIF:
-		data = region->read(vcpu, addr, len);
-		break;
-	case IODEV_DIST:
-		data = region->read(vcpu, addr, len);
-		break;
-	case IODEV_REDIST:
-		data = region->read(iodev->redist_vcpu, addr, len);
-		break;
-	case IODEV_ITS:
-		data = region->its_read(vcpu->kvm, iodev->its, addr, len);
-		break;
-	}
-
-	vgic_data_host_to_mmio_bus(val, len, data);
-	return 0;
-}
-
-static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			       gpa_t addr, int len, const void *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	unsigned long data = vgic_data_mmio_bus_to_host(val, len);
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-	if (!region)
-		return 0;
-
-	switch (iodev->iodev_type) {
-	case IODEV_CPUIF:
-		region->write(vcpu, addr, len, data);
-		break;
-	case IODEV_DIST:
-		region->write(vcpu, addr, len, data);
-		break;
-	case IODEV_REDIST:
-		region->write(iodev->redist_vcpu, addr, len, data);
-		break;
-	case IODEV_ITS:
-		region->its_write(vcpu->kvm, iodev->its, addr, len, data);
-		break;
-	}
-
-	return 0;
-}
-
-struct kvm_io_device_ops kvm_io_gic_ops = {
-	.read = dispatch_mmio_read,
-	.write = dispatch_mmio_write,
-};
-
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-			     enum vgic_type type)
-{
-	struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
-	int ret = 0;
-	unsigned int len;
-
-	switch (type) {
-	case VGIC_V2:
-		len = vgic_v2_init_dist_iodev(io_device);
-		break;
-	case VGIC_V3:
-		len = vgic_v3_init_dist_iodev(io_device);
-		break;
-	default:
-		BUG_ON(1);
-	}
-
-	io_device->base_addr = dist_base_address;
-	io_device->iodev_type = IODEV_DIST;
-	io_device->redist_vcpu = NULL;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
-				      len, &io_device->dev);
-	mutex_unlock(&kvm->slots_lock);
-
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
deleted file mode 100644
index fefcca2b14dc..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-#ifndef __KVM_ARM_VGIC_MMIO_H__
-#define __KVM_ARM_VGIC_MMIO_H__
-
-struct vgic_register_region {
-	unsigned int reg_offset;
-	unsigned int len;
-	unsigned int bits_per_irq;
-	unsigned int access_flags;
-	union {
-		unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-				      unsigned int len);
-		unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
-					  gpa_t addr, unsigned int len);
-	};
-	union {
-		void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
-			      unsigned int len, unsigned long val);
-		void (*its_write)(struct kvm *kvm, struct vgic_its *its,
-				  gpa_t addr, unsigned int len,
-				  unsigned long val);
-	};
-	unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
-				      unsigned int len);
-	union {
-		int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
-				     unsigned int len, unsigned long val);
-		int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
-					 gpa_t addr, unsigned int len,
-					 unsigned long val);
-	};
-};
-
-extern struct kvm_io_device_ops kvm_io_gic_ops;
-
-#define VGIC_ACCESS_8bit	1
-#define VGIC_ACCESS_32bit	2
-#define VGIC_ACCESS_64bit	4
-
-/*
- * Generate a mask that covers the number of bytes required to address
- * up to 1024 interrupts, each represented by <bits> bits. This assumes
- * that <bits> is a power of two.
- */
-#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
-
-/*
- * (addr & mask) gives us the _byte_ offset for the INT ID.
- * We multiply this by 8 the get the _bit_ offset, then divide this by
- * the number of bits to learn the actual INT ID.
- * But instead of a division (which requires a "long long div" implementation),
- * we shift by the binary logarithm of <bits>.
- * This assumes that <bits> is a power of two.
- */
-#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
-					8 >> ilog2(bits))
-
-/*
- * Some VGIC registers store per-IRQ information, with a different number
- * of bits per IRQ. For those registers this macro is used.
- * The _WITH_LENGTH version instantiates registers with a fixed length
- * and is mutually exclusive with the _PER_IRQ version.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc)	\
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = bpi,					\
-		.len = bpi * 1024 / 8,					\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = ur,					\
-		.uaccess_write = uw,					\
-	}
-
-#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)		\
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = 0,					\
-		.len = length,						\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-	}
-
-#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = 0,					\
-		.len = length,						\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = urd,					\
-		.uaccess_write = uwr,					\
-	}
-
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
-
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-				unsigned long data);
-
-unsigned long extract_bytes(u64 data, unsigned int offset,
-			    unsigned int num);
-
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-		     unsigned long val);
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len);
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			unsigned int len, unsigned long val);
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			       unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
-				   unsigned int len);
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-			   unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
-			       gpa_t addr, unsigned int len,
-			       unsigned long val);
-
-int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
-			       gpa_t addr, unsigned int len,
-			       unsigned long val);
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val);
-
-int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val);
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val);
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val);
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-			    gpa_t addr, unsigned int len,
-			    unsigned long val);
-
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-		 bool is_write, int offset, u32 *val);
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-				    const u64 val);
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
-
-u64 vgic_sanitise_outer_cacheability(u64 reg);
-u64 vgic_sanitise_inner_cacheability(u64 reg);
-u64 vgic_sanitise_shareability(u64 reg);
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-			u64 (*sanitise_fn)(u64));
-
-/* Find the proper register handler entry given a certain address offset */
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-		      int nr_regions, unsigned int offset);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
deleted file mode 100644
index 621cc168fe3f..000000000000
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ /dev/null
@@ -1,504 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static inline void vgic_v2_write_lr(int lr, u32 val)
-{
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-
-	writel_relaxed(val, base + GICH_LR0 + (lr * 4));
-}
-
-void vgic_v2_init_lrs(void)
-{
-	int i;
-
-	for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
-		vgic_v2_write_lr(i, 0);
-}
-
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	cpuif->vgic_hcr |= GICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u32 lr_val)
-{
-	return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
-	       !(lr_val & GICH_LR_HW);
-}
-
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- *   - transferred as is in case of edge sensitive IRQs
- *   - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
-	int lr;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-		u32 val = cpuif->vgic_lr[lr];
-		u32 cpuid, intid = val & GICH_LR_VIRTUALID;
-		struct vgic_irq *irq;
-
-		/* Extract the source vCPU id from the LR */
-		cpuid = val & GICH_LR_PHYSID_CPUID;
-		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-		cpuid &= 7;
-
-		/* Notify fds when the guest EOI'ed a level-triggered SPI */
-		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-			kvm_notify_acked_irq(vcpu->kvm, 0,
-					     intid - VGIC_NR_PRIVATE_IRQS);
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-
-		raw_spin_lock(&irq->irq_lock);
-
-		/* Always preserve the active bit */
-		irq->active = !!(val & GICH_LR_ACTIVE_BIT);
-
-		if (irq->active && vgic_irq_is_sgi(intid))
-			irq->active_source = cpuid;
-
-		/* Edge is the only case where we preserve the pending bit */
-		if (irq->config == VGIC_CONFIG_EDGE &&
-		    (val & GICH_LR_PENDING_BIT)) {
-			irq->pending_latch = true;
-
-			if (vgic_irq_is_sgi(intid))
-				irq->source |= (1 << cpuid);
-		}
-
-		/*
-		 * Clear soft pending state when level irqs have been acked.
-		 */
-		if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
-			irq->pending_latch = false;
-
-		/*
-		 * Level-triggered mapped IRQs are special because we only
-		 * observe rising edges as input to the VGIC.
-		 *
-		 * If the guest never acked the interrupt we have to sample
-		 * the physical line and set the line level, because the
-		 * device state could have changed or we simply need to
-		 * process the still pending interrupt later.
-		 *
-		 * If this causes us to lower the level, we have to also clear
-		 * the physical active state, since we will otherwise never be
-		 * told when the interrupt becomes asserted again.
-		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
-
-			if (!irq->line_level)
-				vgic_irq_set_phys_active(irq, false);
-		}
-
-		raw_spin_unlock(&irq->irq_lock);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	vgic_cpu->used_lrs = 0;
-}
-
-/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- *   it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
- */
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-	u32 val = irq->intid;
-	bool allow_pending = true;
-
-	if (irq->active) {
-		val |= GICH_LR_ACTIVE_BIT;
-		if (vgic_irq_is_sgi(irq->intid))
-			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-		if (vgic_irq_is_multi_sgi(irq)) {
-			allow_pending = false;
-			val |= GICH_LR_EOI;
-		}
-	}
-
-	if (irq->group)
-		val |= GICH_LR_GROUP1;
-
-	if (irq->hw) {
-		val |= GICH_LR_HW;
-		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
-		/*
-		 * Never set pending+active on a HW interrupt, as the
-		 * pending state is kept at the physical distributor
-		 * level.
-		 */
-		if (irq->active)
-			allow_pending = false;
-	} else {
-		if (irq->config == VGIC_CONFIG_LEVEL) {
-			val |= GICH_LR_EOI;
-
-			/*
-			 * Software resampling doesn't work very well
-			 * if we allow P+A, so let's not do that.
-			 */
-			if (irq->active)
-				allow_pending = false;
-		}
-	}
-
-	if (allow_pending && irq_is_pending(irq)) {
-		val |= GICH_LR_PENDING_BIT;
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			irq->pending_latch = false;
-
-		if (vgic_irq_is_sgi(irq->intid)) {
-			u32 src = ffs(irq->source);
-
-			if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
-					   irq->intid))
-				return;
-
-			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-			irq->source &= ~(1 << (src - 1));
-			if (irq->source) {
-				irq->pending_latch = true;
-				val |= GICH_LR_EOI;
-			}
-		}
-	}
-
-	/*
-	 * Level-triggered mapped IRQs are special because we only observe
-	 * rising edges as input to the VGIC.  We therefore lower the line
-	 * level here, so that we can take new virtual IRQs.  See
-	 * vgic_v2_fold_lr_state for more info.
-	 */
-	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
-		irq->line_level = false;
-
-	/* The GICv2 LR only holds five bits of priority. */
-	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
-
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
-}
-
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
-}
-
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u32 vmcr;
-
-	vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
-		GICH_VMCR_ENABLE_GRP0_MASK;
-	vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
-		GICH_VMCR_ENABLE_GRP1_MASK;
-	vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
-		GICH_VMCR_ACK_CTL_MASK;
-	vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
-		GICH_VMCR_FIQ_EN_MASK;
-	vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
-		GICH_VMCR_CBPR_MASK;
-	vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
-		GICH_VMCR_EOI_MODE_MASK;
-	vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
-		GICH_VMCR_ALIAS_BINPOINT_MASK;
-	vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
-		GICH_VMCR_BINPOINT_MASK;
-	vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
-		 GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-	cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u32 vmcr;
-
-	vmcr = cpu_if->vgic_vmcr;
-
-	vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
-		GICH_VMCR_ENABLE_GRP0_SHIFT;
-	vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
-		GICH_VMCR_ENABLE_GRP1_SHIFT;
-	vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
-		GICH_VMCR_ACK_CTL_SHIFT;
-	vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
-		GICH_VMCR_FIQ_EN_SHIFT;
-	vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
-		GICH_VMCR_CBPR_SHIFT;
-	vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
-		GICH_VMCR_EOI_MODE_SHIFT;
-
-	vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
-			GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-	vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
-			GICH_VMCR_BINPOINT_SHIFT;
-	vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
-			GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
-}
-
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
-	/* Get the show on the road... */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-/* check for overlapping regions and for regions crossing the end of memory */
-static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
-{
-	if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
-		return false;
-	if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
-		return false;
-
-	if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
-		return true;
-	if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
-		return true;
-
-	return false;
-}
-
-int vgic_v2_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int ret = 0;
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-	    IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-		kvm_err("Need to set vgic cpu and dist addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
-		kvm_err("VGIC CPU and dist frames overlap\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * Initialize the vgic if this hasn't already been done on demand by
-	 * accessing the vgic state from userspace.
-	 */
-	ret = vgic_init(kvm);
-	if (ret) {
-		kvm_err("Unable to initialize VGIC dynamic data structures\n");
-		goto out;
-	}
-
-	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
-	if (ret) {
-		kvm_err("Unable to register VGIC MMIO regions\n");
-		goto out;
-	}
-
-	if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
-		ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-					    kvm_vgic_global_state.vcpu_base,
-					    KVM_VGIC_V2_CPU_SIZE, true);
-		if (ret) {
-			kvm_err("Unable to remap VGIC CPU to VCPU\n");
-			goto out;
-		}
-	}
-
-	dist->ready = true;
-
-out:
-	return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
-
-/**
- * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller
- * @info:	pointer to the GIC description
- *
- * Returns 0 if the VGICv2 has been probed successfully, returns an error code
- * otherwise
- */
-int vgic_v2_probe(const struct gic_kvm_info *info)
-{
-	int ret;
-	u32 vtr;
-
-	if (!info->vctrl.start) {
-		kvm_err("GICH not present in the firmware table\n");
-		return -ENXIO;
-	}
-
-	if (!PAGE_ALIGNED(info->vcpu.start) ||
-	    !PAGE_ALIGNED(resource_size(&info->vcpu))) {
-		kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
-
-		ret = create_hyp_io_mappings(info->vcpu.start,
-					     resource_size(&info->vcpu),
-					     &kvm_vgic_global_state.vcpu_base_va,
-					     &kvm_vgic_global_state.vcpu_hyp_va);
-		if (ret) {
-			kvm_err("Cannot map GICV into hyp\n");
-			goto out;
-		}
-
-		static_branch_enable(&vgic_v2_cpuif_trap);
-	}
-
-	ret = create_hyp_io_mappings(info->vctrl.start,
-				     resource_size(&info->vctrl),
-				     &kvm_vgic_global_state.vctrl_base,
-				     &kvm_vgic_global_state.vctrl_hyp);
-	if (ret) {
-		kvm_err("Cannot map VCTRL into hyp\n");
-		goto out;
-	}
-
-	vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
-	kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
-
-	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-	if (ret) {
-		kvm_err("Cannot register GICv2 KVM device\n");
-		goto out;
-	}
-
-	kvm_vgic_global_state.can_emulate_gicv2 = true;
-	kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-	kvm_vgic_global_state.type = VGIC_V2;
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-	kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
-
-	return 0;
-out:
-	if (kvm_vgic_global_state.vctrl_base)
-		iounmap(kvm_vgic_global_state.vctrl_base);
-	if (kvm_vgic_global_state.vcpu_base_va)
-		iounmap(kvm_vgic_global_state.vcpu_base_va);
-
-	return ret;
-}
-
-static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	u64 elrsr;
-	int i;
-
-	elrsr = readl_relaxed(base + GICH_ELRSR0);
-	if (unlikely(used_lrs > 32))
-		elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
-
-	for (i = 0; i < used_lrs; i++) {
-		if (elrsr & (1UL << i))
-			cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
-		else
-			cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
-
-		writel_relaxed(0, base + GICH_LR0 + (i * 4));
-	}
-}
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu)
-{
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-	if (!base)
-		return;
-
-	if (used_lrs) {
-		save_lrs(vcpu, base);
-		writel_relaxed(0, base + GICH_HCR);
-	}
-}
-
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	if (!base)
-		return;
-
-	if (used_lrs) {
-		writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
-		for (i = 0; i < used_lrs; i++) {
-			writel_relaxed(cpu_if->vgic_lr[i],
-				       base + GICH_LR0 + (i * 4));
-		}
-	}
-}
-
-void vgic_v2_load(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	writel_relaxed(cpu_if->vgic_vmcr,
-		       kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-	writel_relaxed(cpu_if->vgic_apr,
-		       kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
-
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-}
-
-void vgic_v2_put(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	vgic_v2_vmcr_sync(vcpu);
-	cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
deleted file mode 100644
index 2c9fc13e2c59..000000000000
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ /dev/null
@@ -1,693 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_asm.h>
-
-#include "vgic.h"
-
-static bool group0_trap;
-static bool group1_trap;
-static bool common_trap;
-static bool gicv4_enable;
-
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	cpuif->vgic_hcr |= ICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u64 lr_val)
-{
-	return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
-	       !(lr_val & ICH_LR_HW);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	int lr;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
-
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-		u64 val = cpuif->vgic_lr[lr];
-		u32 intid, cpuid;
-		struct vgic_irq *irq;
-		bool is_v2_sgi = false;
-
-		cpuid = val & GICH_LR_PHYSID_CPUID;
-		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-
-		if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-			intid = val & ICH_LR_VIRTUAL_ID_MASK;
-		} else {
-			intid = val & GICH_LR_VIRTUALID;
-			is_v2_sgi = vgic_irq_is_sgi(intid);
-		}
-
-		/* Notify fds when the guest EOI'ed a level-triggered IRQ */
-		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-			kvm_notify_acked_irq(vcpu->kvm, 0,
-					     intid - VGIC_NR_PRIVATE_IRQS);
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-		if (!irq)	/* An LPI could have been unmapped. */
-			continue;
-
-		raw_spin_lock(&irq->irq_lock);
-
-		/* Always preserve the active bit */
-		irq->active = !!(val & ICH_LR_ACTIVE_BIT);
-
-		if (irq->active && is_v2_sgi)
-			irq->active_source = cpuid;
-
-		/* Edge is the only case where we preserve the pending bit */
-		if (irq->config == VGIC_CONFIG_EDGE &&
-		    (val & ICH_LR_PENDING_BIT)) {
-			irq->pending_latch = true;
-
-			if (is_v2_sgi)
-				irq->source |= (1 << cpuid);
-		}
-
-		/*
-		 * Clear soft pending state when level irqs have been acked.
-		 */
-		if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
-			irq->pending_latch = false;
-
-		/*
-		 * Level-triggered mapped IRQs are special because we only
-		 * observe rising edges as input to the VGIC.
-		 *
-		 * If the guest never acked the interrupt we have to sample
-		 * the physical line and set the line level, because the
-		 * device state could have changed or we simply need to
-		 * process the still pending interrupt later.
-		 *
-		 * If this causes us to lower the level, we have to also clear
-		 * the physical active state, since we will otherwise never be
-		 * told when the interrupt becomes asserted again.
-		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
-
-			if (!irq->line_level)
-				vgic_irq_set_phys_active(irq, false);
-		}
-
-		raw_spin_unlock(&irq->irq_lock);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	vgic_cpu->used_lrs = 0;
-}
-
-/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u64 val = irq->intid;
-	bool allow_pending = true, is_v2_sgi;
-
-	is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
-		     model == KVM_DEV_TYPE_ARM_VGIC_V2);
-
-	if (irq->active) {
-		val |= ICH_LR_ACTIVE_BIT;
-		if (is_v2_sgi)
-			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-		if (vgic_irq_is_multi_sgi(irq)) {
-			allow_pending = false;
-			val |= ICH_LR_EOI;
-		}
-	}
-
-	if (irq->hw) {
-		val |= ICH_LR_HW;
-		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
-		/*
-		 * Never set pending+active on a HW interrupt, as the
-		 * pending state is kept at the physical distributor
-		 * level.
-		 */
-		if (irq->active)
-			allow_pending = false;
-	} else {
-		if (irq->config == VGIC_CONFIG_LEVEL) {
-			val |= ICH_LR_EOI;
-
-			/*
-			 * Software resampling doesn't work very well
-			 * if we allow P+A, so let's not do that.
-			 */
-			if (irq->active)
-				allow_pending = false;
-		}
-	}
-
-	if (allow_pending && irq_is_pending(irq)) {
-		val |= ICH_LR_PENDING_BIT;
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			irq->pending_latch = false;
-
-		if (vgic_irq_is_sgi(irq->intid) &&
-		    model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-			u32 src = ffs(irq->source);
-
-			if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
-					   irq->intid))
-				return;
-
-			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-			irq->source &= ~(1 << (src - 1));
-			if (irq->source) {
-				irq->pending_latch = true;
-				val |= ICH_LR_EOI;
-			}
-		}
-	}
-
-	/*
-	 * Level-triggered mapped IRQs are special because we only observe
-	 * rising edges as input to the VGIC.  We therefore lower the line
-	 * level here, so that we can take new virtual IRQs.  See
-	 * vgic_v3_fold_lr_state for more info.
-	 */
-	if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
-		irq->line_level = false;
-
-	if (irq->group)
-		val |= ICH_LR_GROUP;
-
-	val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
-}
-
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
-}
-
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u32 vmcr;
-
-	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-		vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
-			ICH_VMCR_ACK_CTL_MASK;
-		vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
-			ICH_VMCR_FIQ_EN_MASK;
-	} else {
-		/*
-		 * When emulating GICv3 on GICv3 with SRE=1 on the
-		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
-		 */
-		vmcr = ICH_VMCR_FIQ_EN_MASK;
-	}
-
-	vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
-	vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
-	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-	vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
-	vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
-
-	cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u32 vmcr;
-
-	vmcr = cpu_if->vgic_vmcr;
-
-	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-		vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
-			ICH_VMCR_ACK_CTL_SHIFT;
-		vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
-			ICH_VMCR_FIQ_EN_SHIFT;
-	} else {
-		/*
-		 * When emulating GICv3 on GICv3 with SRE=1 on the
-		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
-		 */
-		vmcrp->fiqen = 1;
-		vmcrp->ackctl = 0;
-	}
-
-	vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-	vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
-	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-	vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
-	vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
-}
-
-#define INITIAL_PENDBASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)		| \
-	GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)	| \
-	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vgic_v3->vgic_vmcr = 0;
-
-	/*
-	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
-	 * way, so we force SRE to 1 to demonstrate this to the guest.
-	 * Also, we don't support any form of IRQ/FIQ bypass.
-	 * This goes with the spec allowing the value to be RAO/WI.
-	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
-				     ICC_SRE_EL1_DFB |
-				     ICC_SRE_EL1_SRE);
-		vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
-	} else {
-		vgic_v3->vgic_sre = 0;
-	}
-
-	vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
-					   ICH_VTR_ID_BITS_MASK) >>
-					   ICH_VTR_ID_BITS_SHIFT;
-	vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
-					    ICH_VTR_PRI_BITS_MASK) >>
-					    ICH_VTR_PRI_BITS_SHIFT) + 1;
-
-	/* Get the show on the road... */
-	vgic_v3->vgic_hcr = ICH_HCR_EN;
-	if (group0_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
-	if (group1_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
-	if (common_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TC;
-}
-
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct kvm_vcpu *vcpu;
-	int byte_offset, bit_nr;
-	gpa_t pendbase, ptr;
-	bool status;
-	u8 val;
-	int ret;
-	unsigned long flags;
-
-retry:
-	vcpu = irq->target_vcpu;
-	if (!vcpu)
-		return 0;
-
-	pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-	byte_offset = irq->intid / BITS_PER_BYTE;
-	bit_nr = irq->intid % BITS_PER_BYTE;
-	ptr = pendbase + byte_offset;
-
-	ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-	if (ret)
-		return ret;
-
-	status = val & (1 << bit_nr);
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	if (irq->target_vcpu != vcpu) {
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		goto retry;
-	}
-	irq->pending_latch = status;
-	vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-	if (status) {
-		/* clear consumed data */
-		val &= ~(1 << bit_nr);
-		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
- * kvm lock and all vcpu lock must be held
- */
-int vgic_v3_save_pending_tables(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq;
-	gpa_t last_ptr = ~(gpa_t)0;
-	int ret;
-	u8 val;
-
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		int byte_offset, bit_nr;
-		struct kvm_vcpu *vcpu;
-		gpa_t pendbase, ptr;
-		bool stored;
-
-		vcpu = irq->target_vcpu;
-		if (!vcpu)
-			continue;
-
-		pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-		byte_offset = irq->intid / BITS_PER_BYTE;
-		bit_nr = irq->intid % BITS_PER_BYTE;
-		ptr = pendbase + byte_offset;
-
-		if (ptr != last_ptr) {
-			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-			if (ret)
-				return ret;
-			last_ptr = ptr;
-		}
-
-		stored = val & (1U << bit_nr);
-		if (stored == irq->pending_latch)
-			continue;
-
-		if (irq->pending_latch)
-			val |= 1 << bit_nr;
-		else
-			val &= ~(1 << bit_nr);
-
-		ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_v3_rdist_overlap - check if a region overlaps with any
- * existing redistributor region
- *
- * @kvm: kvm handle
- * @base: base of the region
- * @size: size of region
- *
- * Return: true if there is an overlap
- */
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, &d->rd_regions, list) {
-		if ((base + size > rdreg->base) &&
-			(base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
-			return true;
-	}
-	return false;
-}
-
-/*
- * Check for overlapping regions and for regions crossing the end of memory
- * for base addresses which have already been set.
- */
-bool vgic_v3_check_base(struct kvm *kvm)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-
-	if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-	    d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
-		return false;
-
-	list_for_each_entry(rdreg, &d->rd_regions, list) {
-		if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
-			rdreg->base)
-			return false;
-	}
-
-	if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
-		return true;
-
-	return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
-				      KVM_VGIC_V3_DIST_SIZE);
-}
-
-/**
- * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
- * which has free space to put a new rdist region.
- *
- * @rd_regions: redistributor region list head
- *
- * A redistributor regions maps n redistributors, n = region size / (2 x 64kB).
- * Stride between redistributors is 0 and regions are filled in the index order.
- *
- * Return: the redist region handle, if any, that has space to map a new rdist
- * region.
- */
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
-{
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, rd_regions, list) {
-		if (!vgic_v3_redist_region_full(rdreg))
-			return rdreg;
-	}
-	return NULL;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-							   u32 index)
-{
-	struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, rd_regions, list) {
-		if (rdreg->index == index)
-			return rdreg;
-	}
-	return NULL;
-}
-
-
-int vgic_v3_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int ret = 0;
-	int c;
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-		if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
-			kvm_debug("vcpu %d redistributor base not set\n", c);
-			ret = -ENXIO;
-			goto out;
-		}
-	}
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
-		kvm_err("Need to set vgic distributor addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!vgic_v3_check_base(kvm)) {
-		kvm_err("VGIC redist and dist frames overlap\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * For a VGICv3 we require the userland to explicitly initialize
-	 * the VGIC before we need to use it.
-	 */
-	if (!vgic_initialized(kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
-	if (ret) {
-		kvm_err("Unable to register VGICv3 dist MMIO regions\n");
-		goto out;
-	}
-
-	if (kvm_vgic_global_state.has_gicv4_1)
-		vgic_v4_configure_vsgis(kvm);
-	dist->ready = true;
-
-out:
-	return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
-
-static int __init early_group0_trap_cfg(char *buf)
-{
-	return strtobool(buf, &group0_trap);
-}
-early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
-
-static int __init early_group1_trap_cfg(char *buf)
-{
-	return strtobool(buf, &group1_trap);
-}
-early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
-
-static int __init early_common_trap_cfg(char *buf)
-{
-	return strtobool(buf, &common_trap);
-}
-early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
-
-static int __init early_gicv4_enable(char *buf)
-{
-	return strtobool(buf, &gicv4_enable);
-}
-early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
-
-/**
- * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
- * @info:	pointer to the GIC description
- *
- * Returns 0 if the VGICv3 has been probed successfully, returns an error code
- * otherwise
- */
-int vgic_v3_probe(const struct gic_kvm_info *info)
-{
-	u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
-	int ret;
-
-	/*
-	 * The ListRegs field is 5 bits, but there is a architectural
-	 * maximum of 16 list registers. Just ignore bit 4...
-	 */
-	kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
-	kvm_vgic_global_state.can_emulate_gicv2 = false;
-	kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
-
-	/* GICv4 support? */
-	if (info->has_v4) {
-		kvm_vgic_global_state.has_gicv4 = gicv4_enable;
-		kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
-		kvm_info("GICv4%s support %sabled\n",
-			 kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
-			 gicv4_enable ? "en" : "dis");
-	}
-
-	if (!info->vcpu.start) {
-		kvm_info("GICv3: no GICV resource entry\n");
-		kvm_vgic_global_state.vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(info->vcpu.start)) {
-		pr_warn("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)info->vcpu.start);
-		kvm_vgic_global_state.vcpu_base = 0;
-	} else {
-		kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-		kvm_vgic_global_state.can_emulate_gicv2 = true;
-		ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-		if (ret) {
-			kvm_err("Cannot register GICv2 KVM device.\n");
-			return ret;
-		}
-		kvm_info("vgic-v2@%llx\n", info->vcpu.start);
-	}
-	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
-	if (ret) {
-		kvm_err("Cannot register GICv3 KVM device.\n");
-		kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
-		return ret;
-	}
-
-	if (kvm_vgic_global_state.vcpu_base == 0)
-		kvm_info("disabling GICv2 emulation\n");
-
-#ifdef CONFIG_ARM64
-	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
-		group0_trap = true;
-		group1_trap = true;
-	}
-#endif
-
-	if (group0_trap || group1_trap || common_trap) {
-		kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
-			 group0_trap ? "G0" : "",
-			 group1_trap ? "G1" : "",
-			 common_trap ? "C"  : "");
-		static_branch_enable(&vgic_v3_cpuif_trap);
-	}
-
-	kvm_vgic_global_state.vctrl_base = NULL;
-	kvm_vgic_global_state.type = VGIC_V3;
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-	return 0;
-}
-
-void vgic_v3_load(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
-	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
-	 * VMCR_EL2 save/restore in the world switch.
-	 */
-	if (likely(cpu_if->vgic_sre))
-		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
-
-	kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
-
-	if (has_vhe())
-		__vgic_v3_activate_traps(vcpu);
-
-	WARN_ON(vgic_v4_load(vcpu));
-}
-
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	if (likely(cpu_if->vgic_sre))
-		cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
-}
-
-void vgic_v3_put(struct kvm_vcpu *vcpu)
-{
-	WARN_ON(vgic_v4_put(vcpu, false));
-
-	vgic_v3_vmcr_sync(vcpu);
-
-	kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
-
-	if (has_vhe())
-		__vgic_v3_deactivate_traps(vcpu);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c
deleted file mode 100644
index 27ac833e5ec7..000000000000
--- a/virt/kvm/arm/vgic/vgic-v4.c
+++ /dev/null
@@ -1,453 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2017 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/irqdomain.h>
-#include <linux/kvm_host.h>
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include "vgic.h"
-
-/*
- * How KVM uses GICv4 (insert rude comments here):
- *
- * The vgic-v4 layer acts as a bridge between several entities:
- * - The GICv4 ITS representation offered by the ITS driver
- * - VFIO, which is in charge of the PCI endpoint
- * - The virtual ITS, which is the only thing the guest sees
- *
- * The configuration of VLPIs is triggered by a callback from VFIO,
- * instructing KVM that a PCI device has been configured to deliver
- * MSIs to a vITS.
- *
- * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
- * and this is used to find the corresponding vITS data structures
- * (ITS instance, device, event and irq) using a process that is
- * extremely similar to the injection of an MSI.
- *
- * At this stage, we can link the guest's view of an LPI (uniquely
- * identified by the routing entry) and the host irq, using the GICv4
- * driver mapping operation. Should the mapping succeed, we've then
- * successfully upgraded the guest's LPI to a VLPI. We can then start
- * with updating GICv4's view of the property table and generating an
- * INValidation in order to kickstart the delivery of this VLPI to the
- * guest directly, without software intervention. Well, almost.
- *
- * When the PCI endpoint is deconfigured, this operation is reversed
- * with VFIO calling kvm_vgic_v4_unset_forwarding().
- *
- * Once the VLPI has been mapped, it needs to follow any change the
- * guest performs on its LPI through the vITS. For that, a number of
- * command handlers have hooks to communicate these changes to the HW:
- * - Any invalidation triggers a call to its_prop_update_vlpi()
- * - The INT command results in a irq_set_irqchip_state(), which
- *   generates an INT on the corresponding VLPI.
- * - The CLEAR command results in a irq_set_irqchip_state(), which
- *   generates an CLEAR on the corresponding VLPI.
- * - DISCARD translates into an unmap, similar to a call to
- *   kvm_vgic_v4_unset_forwarding().
- * - MOVI is translated by an update of the existing mapping, changing
- *   the target vcpu, resulting in a VMOVI being generated.
- * - MOVALL is translated by a string of mapping updates (similar to
- *   the handling of MOVI). MOVALL is horrible.
- *
- * Note that a DISCARD/MAPTI sequence emitted from the guest without
- * reprogramming the PCI endpoint after MAPTI does not result in a
- * VLPI being mapped, as there is no callback from VFIO (the guest
- * will get the interrupt via the normal SW injection). Fixing this is
- * not trivial, and requires some horrible messing with the VFIO
- * internals. Not fun. Don't do that.
- *
- * Then there is the scheduling. Each time a vcpu is about to run on a
- * physical CPU, KVM must tell the corresponding redistributor about
- * it. And if we've migrated our vcpu from one CPU to another, we must
- * tell the ITS (so that the messages reach the right redistributor).
- * This is done in two steps: first issue a irq_set_affinity() on the
- * irq corresponding to the vcpu, then call its_make_vpe_resident().
- * You must be in a non-preemptible context. On exit, a call to
- * its_make_vpe_non_resident() tells the redistributor that we're done
- * with the vcpu.
- *
- * Finally, the doorbell handling: Each vcpu is allocated an interrupt
- * which will fire each time a VLPI is made pending whilst the vcpu is
- * not running. Each time the vcpu gets blocked, the doorbell
- * interrupt gets enabled. When the vcpu is unblocked (for whatever
- * reason), the doorbell interrupt is disabled.
- */
-
-#define DB_IRQ_FLAGS	(IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
-
-static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
-{
-	struct kvm_vcpu *vcpu = info;
-
-	/* We got the message, no need to fire again */
-	if (!kvm_vgic_global_state.has_gicv4_1 &&
-	    !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
-		disable_irq_nosync(irq);
-
-	vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return IRQ_HANDLED;
-}
-
-static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
-{
-	vpe->sgi_config[irq->intid].enabled	= irq->enabled;
-	vpe->sgi_config[irq->intid].group 	= irq->group;
-	vpe->sgi_config[irq->intid].priority	= irq->priority;
-}
-
-static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
-{
-	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-	int i;
-
-	/*
-	 * With GICv4.1, every virtual SGI can be directly injected. So
-	 * let's pretend that they are HW interrupts, tied to a host
-	 * IRQ. The SGI code will do its magic.
-	 */
-	for (i = 0; i < VGIC_NR_SGIS; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
-		struct irq_desc *desc;
-		unsigned long flags;
-		int ret;
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (irq->hw)
-			goto unlock;
-
-		irq->hw = true;
-		irq->host_irq = irq_find_mapping(vpe->sgi_domain, i);
-
-		/* Transfer the full irq state to the vPE */
-		vgic_v4_sync_sgi_config(vpe, irq);
-		desc = irq_to_desc(irq->host_irq);
-		ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
-					      false);
-		if (!WARN_ON(ret)) {
-			/* Transfer pending state */
-			ret = irq_set_irqchip_state(irq->host_irq,
-						    IRQCHIP_STATE_PENDING,
-						    irq->pending_latch);
-			WARN_ON(ret);
-			irq->pending_latch = false;
-		}
-	unlock:
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
-{
-	int i;
-
-	for (i = 0; i < VGIC_NR_SGIS; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
-		struct irq_desc *desc;
-		unsigned long flags;
-		int ret;
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (!irq->hw)
-			goto unlock;
-
-		irq->hw = false;
-		ret = irq_get_irqchip_state(irq->host_irq,
-					    IRQCHIP_STATE_PENDING,
-					    &irq->pending_latch);
-		WARN_ON(ret);
-
-		desc = irq_to_desc(irq->host_irq);
-		irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
-	unlock:
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-/* Must be called with the kvm lock held */
-void vgic_v4_configure_vsgis(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_arm_halt_guest(kvm);
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (dist->nassgireq)
-			vgic_v4_enable_vsgis(vcpu);
-		else
-			vgic_v4_disable_vsgis(vcpu);
-	}
-
-	kvm_arm_resume_guest(kvm);
-}
-
-/**
- * vgic_v4_init - Initialize the GICv4 data structures
- * @kvm:	Pointer to the VM being initialized
- *
- * We may be called each time a vITS is created, or when the
- * vgic is initialized. This relies on kvm->lock to be
- * held. In both cases, the number of vcpus should now be
- * fixed.
- */
-int vgic_v4_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, nr_vcpus, ret;
-
-	if (!kvm_vgic_global_state.has_gicv4)
-		return 0; /* Nothing to see here... move along. */
-
-	if (dist->its_vm.vpes)
-		return 0;
-
-	nr_vcpus = atomic_read(&kvm->online_vcpus);
-
-	dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
-				    GFP_KERNEL);
-	if (!dist->its_vm.vpes)
-		return -ENOMEM;
-
-	dist->its_vm.nr_vpes = nr_vcpus;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-	ret = its_alloc_vcpu_irqs(&dist->its_vm);
-	if (ret < 0) {
-		kvm_err("VPE IRQ allocation failure\n");
-		kfree(dist->its_vm.vpes);
-		dist->its_vm.nr_vpes = 0;
-		dist->its_vm.vpes = NULL;
-		return ret;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		int irq = dist->its_vm.vpes[i]->irq;
-		unsigned long irq_flags = DB_IRQ_FLAGS;
-
-		/*
-		 * Don't automatically enable the doorbell, as we're
-		 * flipping it back and forth when the vcpu gets
-		 * blocked. Also disable the lazy disabling, as the
-		 * doorbell could kick us out of the guest too
-		 * early...
-		 *
-		 * On GICv4.1, the doorbell is managed in HW and must
-		 * be left enabled.
-		 */
-		if (kvm_vgic_global_state.has_gicv4_1)
-			irq_flags &= ~IRQ_NOAUTOEN;
-		irq_set_status_flags(irq, irq_flags);
-
-		ret = request_irq(irq, vgic_v4_doorbell_handler,
-				  0, "vcpu", vcpu);
-		if (ret) {
-			kvm_err("failed to allocate vcpu IRQ%d\n", irq);
-			/*
-			 * Trick: adjust the number of vpes so we know
-			 * how many to nuke on teardown...
-			 */
-			dist->its_vm.nr_vpes = i;
-			break;
-		}
-	}
-
-	if (ret)
-		vgic_v4_teardown(kvm);
-
-	return ret;
-}
-
-/**
- * vgic_v4_teardown - Free the GICv4 data structures
- * @kvm:	Pointer to the VM being destroyed
- *
- * Relies on kvm->lock to be held.
- */
-void vgic_v4_teardown(struct kvm *kvm)
-{
-	struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
-	int i;
-
-	if (!its_vm->vpes)
-		return;
-
-	for (i = 0; i < its_vm->nr_vpes; i++) {
-		struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
-		int irq = its_vm->vpes[i]->irq;
-
-		irq_clear_status_flags(irq, DB_IRQ_FLAGS);
-		free_irq(irq, vcpu);
-	}
-
-	its_free_vcpu_irqs(its_vm);
-	kfree(its_vm->vpes);
-	its_vm->nr_vpes = 0;
-	its_vm->vpes = NULL;
-}
-
-int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
-{
-	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-	if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
-		return 0;
-
-	return its_make_vpe_non_resident(vpe, need_db);
-}
-
-int vgic_v4_load(struct kvm_vcpu *vcpu)
-{
-	struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-	int err;
-
-	if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
-		return 0;
-
-	/*
-	 * Before making the VPE resident, make sure the redistributor
-	 * corresponding to our current CPU expects us here. See the
-	 * doc in drivers/irqchip/irq-gic-v4.c to understand how this
-	 * turns into a VMOVP command at the ITS level.
-	 */
-	err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
-	if (err)
-		return err;
-
-	err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
-	if (err)
-		return err;
-
-	/*
-	 * Now that the VPE is resident, let's get rid of a potential
-	 * doorbell interrupt that would still be pending. This is a
-	 * GICv4.0 only "feature"...
-	 */
-	if (!kvm_vgic_global_state.has_gicv4_1)
-		err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
-
-	return err;
-}
-
-static struct vgic_its *vgic_get_its(struct kvm *kvm,
-				     struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct kvm_msi msi  = (struct kvm_msi) {
-		.address_lo	= irq_entry->msi.address_lo,
-		.address_hi	= irq_entry->msi.address_hi,
-		.data		= irq_entry->msi.data,
-		.flags		= irq_entry->msi.flags,
-		.devid		= irq_entry->msi.devid,
-	};
-
-	return vgic_msi_to_its(kvm, &msi);
-}
-
-int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
-			       struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct vgic_its *its;
-	struct vgic_irq *irq;
-	struct its_vlpi_map map;
-	int ret;
-
-	if (!vgic_supports_direct_msis(kvm))
-		return 0;
-
-	/*
-	 * Get the ITS, and escape early on error (not a valid
-	 * doorbell for any of our vITSs).
-	 */
-	its = vgic_get_its(kvm, irq_entry);
-	if (IS_ERR(its))
-		return 0;
-
-	mutex_lock(&its->its_lock);
-
-	/* Perform the actual DevID/EventID -> LPI translation. */
-	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-				   irq_entry->msi.data, &irq);
-	if (ret)
-		goto out;
-
-	/*
-	 * Emit the mapping request. If it fails, the ITS probably
-	 * isn't v4 compatible, so let's silently bail out. Holding
-	 * the ITS lock should ensure that nothing can modify the
-	 * target vcpu.
-	 */
-	map = (struct its_vlpi_map) {
-		.vm		= &kvm->arch.vgic.its_vm,
-		.vpe		= &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
-		.vintid		= irq->intid,
-		.properties	= ((irq->priority & 0xfc) |
-				   (irq->enabled ? LPI_PROP_ENABLED : 0) |
-				   LPI_PROP_GROUP1),
-		.db_enabled	= true,
-	};
-
-	ret = its_map_vlpi(virq, &map);
-	if (ret)
-		goto out;
-
-	irq->hw		= true;
-	irq->host_irq	= virq;
-	atomic_inc(&map.vpe->vlpi_count);
-
-out:
-	mutex_unlock(&its->its_lock);
-	return ret;
-}
-
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
-				 struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct vgic_its *its;
-	struct vgic_irq *irq;
-	int ret;
-
-	if (!vgic_supports_direct_msis(kvm))
-		return 0;
-
-	/*
-	 * Get the ITS, and escape early on error (not a valid
-	 * doorbell for any of our vITSs).
-	 */
-	its = vgic_get_its(kvm, irq_entry);
-	if (IS_ERR(its))
-		return 0;
-
-	mutex_lock(&its->its_lock);
-
-	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-				   irq_entry->msi.data, &irq);
-	if (ret)
-		goto out;
-
-	WARN_ON(!(irq->hw && irq->host_irq == virq));
-	if (irq->hw) {
-		atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
-		irq->hw = false;
-		ret = its_unmap_vlpi(virq);
-	}
-
-out:
-	mutex_unlock(&its->its_lock);
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
deleted file mode 100644
index 99b02ca730a8..000000000000
--- a/virt/kvm/arm/vgic/vgic.c
+++ /dev/null
@@ -1,1011 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/list_sort.h>
-#include <linux/nospec.h>
-
-#include <asm/kvm_hyp.h>
-
-#include "vgic.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-struct vgic_global kvm_vgic_global_state __ro_after_init = {
-	.gicv3_cpuif = STATIC_KEY_FALSE_INIT,
-};
-
-/*
- * Locking order is always:
- * kvm->lock (mutex)
- *   its->cmd_lock (mutex)
- *     its->its_lock (mutex)
- *       vgic_cpu->ap_list_lock		must be taken with IRQs disabled
- *         kvm->lpi_list_lock		must be taken with IRQs disabled
- *           vgic_irq->irq_lock		must be taken with IRQs disabled
- *
- * As the ap_list_lock might be taken from the timer interrupt handler,
- * we have to disable IRQs before taking this lock and everything lower
- * than it.
- *
- * If you need to take multiple locks, always take the upper lock first,
- * then the lower ones, e.g. first take the its_lock, then the irq_lock.
- * If you are already holding a lock and need to take a higher one, you
- * have to drop the lower ranking lock first and re-aquire it after having
- * taken the upper one.
- *
- * When taking more than one ap_list_lock at the same time, always take the
- * lowest numbered VCPU's ap_list_lock first, so:
- *   vcpuX->vcpu_id < vcpuY->vcpu_id:
- *     raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
- *     raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
- *
- * Since the VGIC must support injecting virtual interrupts from ISRs, we have
- * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
- * spinlocks for any lock that may be taken while injecting an interrupt.
- */
-
-/*
- * Iterate over the VM's list of mapped LPIs to find the one with a
- * matching interrupt ID and return a reference to the IRQ structure.
- */
-static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq = NULL;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		if (irq->intid != intid)
-			continue;
-
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() later once it's finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-		goto out_unlock;
-	}
-	irq = NULL;
-
-out_unlock:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	return irq;
-}
-
-/*
- * This looks up the virtual interrupt ID to get the corresponding
- * struct vgic_irq. It also increases the refcount, so any caller is expected
- * to call vgic_put_irq() once it's finished with this IRQ.
- */
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-			      u32 intid)
-{
-	/* SGIs and PPIs */
-	if (intid <= VGIC_MAX_PRIVATE) {
-		intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
-		return &vcpu->arch.vgic_cpu.private_irqs[intid];
-	}
-
-	/* SPIs */
-	if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-		intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
-		return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
-	}
-
-	/* LPIs */
-	if (intid >= VGIC_MIN_LPI)
-		return vgic_get_lpi(kvm, intid);
-
-	WARN(1, "Looking up struct vgic_irq for reserved INTID");
-	return NULL;
-}
-
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
-{
-}
-
-/*
- * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
- */
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	if (!kref_put(&irq->refcount, vgic_irq_release))
-		return;
-
-	list_del(&irq->lpi_list);
-	dist->lpi_list_count--;
-
-	kfree(irq);
-}
-
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	unsigned long flags;
-
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	__vgic_put_lpi_locked(kvm, irq);
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-}
-
-void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq, *tmp;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
-		if (irq->intid >= VGIC_MIN_LPI) {
-			raw_spin_lock(&irq->irq_lock);
-			list_del(&irq->ap_list);
-			irq->vcpu = NULL;
-			raw_spin_unlock(&irq->irq_lock);
-			vgic_put_irq(vcpu->kvm, irq);
-		}
-	}
-
-	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
-}
-
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
-{
-	WARN_ON(irq_set_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_PENDING,
-				      pending));
-}
-
-bool vgic_get_phys_line_level(struct vgic_irq *irq)
-{
-	bool line_level;
-
-	BUG_ON(!irq->hw);
-
-	if (irq->get_input_level)
-		return irq->get_input_level(irq->intid);
-
-	WARN_ON(irq_get_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_PENDING,
-				      &line_level));
-	return line_level;
-}
-
-/* Set/Clear the physical active state */
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
-{
-
-	BUG_ON(!irq->hw);
-	WARN_ON(irq_set_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_ACTIVE,
-				      active));
-}
-
-/**
- * kvm_vgic_target_oracle - compute the target vcpu for an irq
- *
- * @irq:	The irq to route. Must be already locked.
- *
- * Based on the current state of the interrupt (enabled, pending,
- * active, vcpu and target_vcpu), compute the next vcpu this should be
- * given to. Return NULL if this shouldn't be injected at all.
- *
- * Requires the IRQ lock to be held.
- */
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
-{
-	lockdep_assert_held(&irq->irq_lock);
-
-	/* If the interrupt is active, it must stay on the current vcpu */
-	if (irq->active)
-		return irq->vcpu ? : irq->target_vcpu;
-
-	/*
-	 * If the IRQ is not active but enabled and pending, we should direct
-	 * it to its configured target VCPU.
-	 * If the distributor is disabled, pending interrupts shouldn't be
-	 * forwarded.
-	 */
-	if (irq->enabled && irq_is_pending(irq)) {
-		if (unlikely(irq->target_vcpu &&
-			     !irq->target_vcpu->kvm->arch.vgic.enabled))
-			return NULL;
-
-		return irq->target_vcpu;
-	}
-
-	/* If neither active nor pending and enabled, then this IRQ should not
-	 * be queued to any VCPU.
-	 */
-	return NULL;
-}
-
-/*
- * The order of items in the ap_lists defines how we'll pack things in LRs as
- * well, the first items in the list being the first things populated in the
- * LRs.
- *
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
- * Otherwise things should be sorted by the priority field and the GIC
- * hardware support will take care of preemption of priority groups etc.
- *
- * Return negative if "a" sorts before "b", 0 to preserve order, and positive
- * to sort "b" before "a".
- */
-static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-	struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
-	struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
-	bool penda, pendb;
-	int ret;
-
-	/*
-	 * list_sort may call this function with the same element when
-	 * the list is fairly long.
-	 */
-	if (unlikely(irqa == irqb))
-		return 0;
-
-	raw_spin_lock(&irqa->irq_lock);
-	raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
-
-	if (irqa->active || irqb->active) {
-		ret = (int)irqb->active - (int)irqa->active;
-		goto out;
-	}
-
-	penda = irqa->enabled && irq_is_pending(irqa);
-	pendb = irqb->enabled && irq_is_pending(irqb);
-
-	if (!penda || !pendb) {
-		ret = (int)pendb - (int)penda;
-		goto out;
-	}
-
-	/* Both pending and enabled, sort by priority */
-	ret = irqa->priority - irqb->priority;
-out:
-	raw_spin_unlock(&irqb->irq_lock);
-	raw_spin_unlock(&irqa->irq_lock);
-	return ret;
-}
-
-/* Must be called with the ap_list_lock held */
-static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
-}
-
-/*
- * Only valid injection if changing level for level-triggered IRQs or for a
- * rising edge, and in-kernel connected IRQ lines can only be controlled by
- * their owner.
- */
-static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
-{
-	if (irq->owner != owner)
-		return false;
-
-	switch (irq->config) {
-	case VGIC_CONFIG_LEVEL:
-		return irq->line_level != level;
-	case VGIC_CONFIG_EDGE:
-		return level;
-	}
-
-	return false;
-}
-
-/*
- * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
- * Do the queuing if necessary, taking the right locks in the right order.
- * Returns true when the IRQ was queued, false otherwise.
- *
- * Needs to be entered with the IRQ lock already held, but will return
- * with all locks dropped.
- */
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-			   unsigned long flags)
-{
-	struct kvm_vcpu *vcpu;
-
-	lockdep_assert_held(&irq->irq_lock);
-
-retry:
-	vcpu = vgic_target_oracle(irq);
-	if (irq->vcpu || !vcpu) {
-		/*
-		 * If this IRQ is already on a VCPU's ap_list, then it
-		 * cannot be moved or modified and there is no more work for
-		 * us to do.
-		 *
-		 * Otherwise, if the irq is not pending and enabled, it does
-		 * not need to be inserted into an ap_list and there is also
-		 * no more work for us to do.
-		 */
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		/*
-		 * We have to kick the VCPU here, because we could be
-		 * queueing an edge-triggered interrupt for which we
-		 * get no EOI maintenance interrupt. In that case,
-		 * while the IRQ is already on the VCPU's AP list, the
-		 * VCPU could have EOI'ed the original interrupt and
-		 * won't see this one until it exits for some other
-		 * reason.
-		 */
-		if (vcpu) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
-		return false;
-	}
-
-	/*
-	 * We must unlock the irq lock to take the ap_list_lock where
-	 * we are going to insert this new pending interrupt.
-	 */
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	/* someone can do stuff here, which we re-check below */
-
-	raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-	raw_spin_lock(&irq->irq_lock);
-
-	/*
-	 * Did something change behind our backs?
-	 *
-	 * There are two cases:
-	 * 1) The irq lost its pending state or was disabled behind our
-	 *    backs and/or it was queued to another VCPU's ap_list.
-	 * 2) Someone changed the affinity on this irq behind our
-	 *    backs and we are now holding the wrong ap_list_lock.
-	 *
-	 * In both cases, drop the locks and retry.
-	 */
-
-	if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
-		raw_spin_unlock(&irq->irq_lock);
-		raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
-					   flags);
-
-		raw_spin_lock_irqsave(&irq->irq_lock, flags);
-		goto retry;
-	}
-
-	/*
-	 * Grab a reference to the irq to reflect the fact that it is
-	 * now in the ap_list.
-	 */
-	vgic_get_irq_kref(irq);
-	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
-	irq->vcpu = vcpu;
-
-	raw_spin_unlock(&irq->irq_lock);
-	raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return true;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @intid:   The INTID to inject a new state to.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *			      false: to ignore the call
- *	     Level-sensitive  true:  raise the input signal
- *			      false: lower the input signal
- * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
- *           that the caller is allowed to inject this IRQ.  Userspace
- *           injections will have owner == NULL.
- *
- * The VGIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			bool level, void *owner)
-{
-	struct kvm_vcpu *vcpu;
-	struct vgic_irq *irq;
-	unsigned long flags;
-	int ret;
-
-	trace_vgic_update_irq_pending(cpuid, intid, level);
-
-	ret = vgic_lazy_init(kvm);
-	if (ret)
-		return ret;
-
-	vcpu = kvm_get_vcpu(kvm, cpuid);
-	if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
-		return -EINVAL;
-
-	irq = vgic_get_irq(kvm, vcpu, intid);
-	if (!irq)
-		return -EINVAL;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (!vgic_validate_injection(irq, level, owner)) {
-		/* Nothing to see here, move along... */
-		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(kvm, irq);
-		return 0;
-	}
-
-	if (irq->config == VGIC_CONFIG_LEVEL)
-		irq->line_level = level;
-	else
-		irq->pending_latch = true;
-
-	vgic_queue_irq_unlock(kvm, irq, flags);
-	vgic_put_irq(kvm, irq);
-
-	return 0;
-}
-
-/* @irq->irq_lock must be held */
-static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-			    unsigned int host_irq,
-			    bool (*get_input_level)(int vindid))
-{
-	struct irq_desc *desc;
-	struct irq_data *data;
-
-	/*
-	 * Find the physical IRQ number corresponding to @host_irq
-	 */
-	desc = irq_to_desc(host_irq);
-	if (!desc) {
-		kvm_err("%s: no interrupt descriptor\n", __func__);
-		return -EINVAL;
-	}
-	data = irq_desc_get_irq_data(desc);
-	while (data->parent_data)
-		data = data->parent_data;
-
-	irq->hw = true;
-	irq->host_irq = host_irq;
-	irq->hwintid = data->hwirq;
-	irq->get_input_level = get_input_level;
-	return 0;
-}
-
-/* @irq->irq_lock must be held */
-static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
-{
-	irq->hw = false;
-	irq->hwintid = 0;
-	irq->get_input_level = NULL;
-}
-
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid, bool (*get_input_level)(int vindid))
-{
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	unsigned long flags;
-	int ret;
-
-	BUG_ON(!irq);
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return ret;
-}
-
-/**
- * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
- * @vcpu: The VCPU pointer
- * @vintid: The INTID of the interrupt
- *
- * Reset the active and pending states of a mapped interrupt.  Kernel
- * subsystems injecting mapped interrupts should reset their interrupt lines
- * when we are doing a reset of the VM.
- */
-void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
-{
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	unsigned long flags;
-
-	if (!irq->hw)
-		goto out;
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->active = false;
-	irq->pending_latch = false;
-	irq->line_level = false;
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-out:
-	vgic_put_irq(vcpu->kvm, irq);
-}
-
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -EAGAIN;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	BUG_ON(!irq);
-
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	kvm_vgic_unmap_irq(irq);
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return 0;
-}
-
-/**
- * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
- *
- * @vcpu:   Pointer to the VCPU (used for PPIs)
- * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
- * @owner:  Opaque pointer to the owner
- *
- * Returns 0 if intid is not already used by another in-kernel device and the
- * owner is set, otherwise returns an error code.
- */
-int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
-{
-	struct vgic_irq *irq;
-	unsigned long flags;
-	int ret = 0;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -EAGAIN;
-
-	/* SGIs and LPIs cannot be wired up to any device */
-	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
-		return -EINVAL;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	if (irq->owner && irq->owner != owner)
-		ret = -EEXIST;
-	else
-		irq->owner = owner;
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	return ret;
-}
-
-/**
- * vgic_prune_ap_list - Remove non-relevant interrupts from the list
- *
- * @vcpu: The VCPU pointer
- *
- * Go over the list of "interesting" interrupts, and prune those that we
- * won't have to consider in the near future.
- */
-static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq, *tmp;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-retry:
-	raw_spin_lock(&vgic_cpu->ap_list_lock);
-
-	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
-		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
-		bool target_vcpu_needs_kick = false;
-
-		raw_spin_lock(&irq->irq_lock);
-
-		BUG_ON(vcpu != irq->vcpu);
-
-		target_vcpu = vgic_target_oracle(irq);
-
-		if (!target_vcpu) {
-			/*
-			 * We don't need to process this interrupt any
-			 * further, move it off the list.
-			 */
-			list_del(&irq->ap_list);
-			irq->vcpu = NULL;
-			raw_spin_unlock(&irq->irq_lock);
-
-			/*
-			 * This vgic_put_irq call matches the
-			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
-			 * where we added the LPI to the ap_list. As
-			 * we remove the irq from the list, we drop
-			 * also drop the refcount.
-			 */
-			vgic_put_irq(vcpu->kvm, irq);
-			continue;
-		}
-
-		if (target_vcpu == vcpu) {
-			/* We're on the right CPU */
-			raw_spin_unlock(&irq->irq_lock);
-			continue;
-		}
-
-		/* This interrupt looks like it has to be migrated. */
-
-		raw_spin_unlock(&irq->irq_lock);
-		raw_spin_unlock(&vgic_cpu->ap_list_lock);
-
-		/*
-		 * Ensure locking order by always locking the smallest
-		 * ID first.
-		 */
-		if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
-			vcpuA = vcpu;
-			vcpuB = target_vcpu;
-		} else {
-			vcpuA = target_vcpu;
-			vcpuB = vcpu;
-		}
-
-		raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-		raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
-				      SINGLE_DEPTH_NESTING);
-		raw_spin_lock(&irq->irq_lock);
-
-		/*
-		 * If the affinity has been preserved, move the
-		 * interrupt around. Otherwise, it means things have
-		 * changed while the interrupt was unlocked, and we
-		 * need to replay this.
-		 *
-		 * In all cases, we cannot trust the list not to have
-		 * changed, so we restart from the beginning.
-		 */
-		if (target_vcpu == vgic_target_oracle(irq)) {
-			struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
-
-			list_del(&irq->ap_list);
-			irq->vcpu = target_vcpu;
-			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
-			target_vcpu_needs_kick = true;
-		}
-
-		raw_spin_unlock(&irq->irq_lock);
-		raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
-		raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-
-		if (target_vcpu_needs_kick) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
-			kvm_vcpu_kick(target_vcpu);
-		}
-
-		goto retry;
-	}
-
-	raw_spin_unlock(&vgic_cpu->ap_list_lock);
-}
-
-static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_fold_lr_state(vcpu);
-	else
-		vgic_v3_fold_lr_state(vcpu);
-}
-
-/* Requires the irq_lock to be held. */
-static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
-				    struct vgic_irq *irq, int lr)
-{
-	lockdep_assert_held(&irq->irq_lock);
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_populate_lr(vcpu, irq, lr);
-	else
-		vgic_v3_populate_lr(vcpu, irq, lr);
-}
-
-static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_clear_lr(vcpu, lr);
-	else
-		vgic_v3_clear_lr(vcpu, lr);
-}
-
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_underflow(vcpu);
-	else
-		vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
-				 bool *multi_sgi)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	int count = 0;
-
-	*multi_sgi = false;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		int w;
-
-		raw_spin_lock(&irq->irq_lock);
-		/* GICv2 SGIs can count for more than one... */
-		w = vgic_irq_get_lr_count(irq);
-		raw_spin_unlock(&irq->irq_lock);
-
-		count += w;
-		*multi_sgi |= (w > 1);
-	}
-	return count;
-}
-
-/* Requires the VCPU's ap_list_lock to be held. */
-static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	int count;
-	bool multi_sgi;
-	u8 prio = 0xff;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	count = compute_ap_list_depth(vcpu, &multi_sgi);
-	if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
-		vgic_sort_ap_list(vcpu);
-
-	count = 0;
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		raw_spin_lock(&irq->irq_lock);
-
-		/*
-		 * If we have multi-SGIs in the pipeline, we need to
-		 * guarantee that they are all seen before any IRQ of
-		 * lower priority. In that case, we need to filter out
-		 * these interrupts by exiting early. This is easy as
-		 * the AP list has been sorted already.
-		 */
-		if (multi_sgi && irq->priority > prio) {
-			_raw_spin_unlock(&irq->irq_lock);
-			break;
-		}
-
-		if (likely(vgic_target_oracle(irq) == vcpu)) {
-			vgic_populate_lr(vcpu, irq, count++);
-
-			if (irq->source)
-				prio = irq->priority;
-		}
-
-		raw_spin_unlock(&irq->irq_lock);
-
-		if (count == kvm_vgic_global_state.nr_lr) {
-			if (!list_is_last(&irq->ap_list,
-					  &vgic_cpu->ap_list_head))
-				vgic_set_underflow(vcpu);
-			break;
-		}
-	}
-
-	vcpu->arch.vgic_cpu.used_lrs = count;
-
-	/* Nuke remaining LRs */
-	for ( ; count < kvm_vgic_global_state.nr_lr; count++)
-		vgic_clear_lr(vcpu, count);
-}
-
-static inline bool can_access_vgic_from_kernel(void)
-{
-	/*
-	 * GICv2 can always be accessed from the kernel because it is
-	 * memory-mapped, and VHE systems can access GICv3 EL2 system
-	 * registers.
-	 */
-	return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
-}
-
-static inline void vgic_save_state(struct kvm_vcpu *vcpu)
-{
-	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-		vgic_v2_save_state(vcpu);
-	else
-		__vgic_v3_save_state(vcpu);
-}
-
-/* Sync back the hardware VGIC state into our emulation after a guest's run. */
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	/* An empty ap_list_head implies used_lrs == 0 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
-		return;
-
-	if (can_access_vgic_from_kernel())
-		vgic_save_state(vcpu);
-
-	if (vgic_cpu->used_lrs)
-		vgic_fold_lr_state(vcpu);
-	vgic_prune_ap_list(vcpu);
-}
-
-static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
-{
-	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-		vgic_v2_restore_state(vcpu);
-	else
-		__vgic_v3_restore_state(vcpu);
-}
-
-/* Flush our emulation state into the GIC hardware before entering the guest. */
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * If there are no virtual interrupts active or pending for this
-	 * VCPU, then there is no work to do and we can bail out without
-	 * taking any lock.  There is a potential race with someone injecting
-	 * interrupts to the VCPU, but it is a benign race as the VCPU will
-	 * either observe the new interrupt before or after doing this check,
-	 * and introducing additional synchronization mechanism doesn't change
-	 * this.
-	 *
-	 * Note that we still need to go through the whole thing if anything
-	 * can be directly injected (GICv4).
-	 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
-	    !vgic_supports_direct_msis(vcpu->kvm))
-		return;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
-		raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-		vgic_flush_lr_state(vcpu);
-		raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
-	}
-
-	if (can_access_vgic_from_kernel())
-		vgic_restore_state(vcpu);
-}
-
-void kvm_vgic_load(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!vgic_initialized(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_load(vcpu);
-	else
-		vgic_v3_load(vcpu);
-}
-
-void kvm_vgic_put(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!vgic_initialized(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_put(vcpu);
-	else
-		vgic_v3_put(vcpu);
-}
-
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_vmcr_sync(vcpu);
-	else
-		vgic_v3_vmcr_sync(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	bool pending = false;
-	unsigned long flags;
-	struct vgic_vmcr vmcr;
-
-	if (!vcpu->kvm->arch.vgic.enabled)
-		return false;
-
-	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
-		return true;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		raw_spin_lock(&irq->irq_lock);
-		pending = irq_is_pending(irq) && irq->enabled &&
-			  !irq->active &&
-			  irq->priority < vmcr.pmr;
-		raw_spin_unlock(&irq->irq_lock);
-
-		if (pending)
-			break;
-	}
-
-	raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
-
-	return pending;
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int c;
-
-	/*
-	 * We've injected an interrupt, time to find out who deserves
-	 * a good kick...
-	 */
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (kvm_vgic_vcpu_pending_irq(vcpu)) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
-	}
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-	struct vgic_irq *irq;
-	bool map_is_active;
-	unsigned long flags;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return false;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	raw_spin_lock_irqsave(&irq->irq_lock, flags);
-	map_is_active = irq->hw && irq->active;
-	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return map_is_active;
-}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
deleted file mode 100644
index 769e4802645e..000000000000
--- a/virt/kvm/arm/vgic/vgic.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- */
-#ifndef __KVM_ARM_VGIC_NEW_H__
-#define __KVM_ARM_VGIC_NEW_H__
-
-#include <linux/irqchip/arm-gic-common.h>
-
-#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
-#define IMPLEMENTER_ARM		0x43b
-
-#define VGIC_ADDR_UNDEF		(-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define INTERRUPT_ID_BITS_SPIS	10
-#define INTERRUPT_ID_BITS_ITS	16
-#define VGIC_PRI_BITS		5
-
-#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
-
-#define VGIC_AFFINITY_0_SHIFT 0
-#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
-#define VGIC_AFFINITY_1_SHIFT 8
-#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
-#define VGIC_AFFINITY_2_SHIFT 16
-#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
-#define VGIC_AFFINITY_3_SHIFT 24
-#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
-
-#define VGIC_AFFINITY_LEVEL(reg, level) \
-	((((reg) & VGIC_AFFINITY_## level ##_MASK) \
-	>> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/*
- * The Userspace encodes the affinity differently from the MPIDR,
- * Below macro converts vgic userspace format to MPIDR reg format.
- */
-#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
-			    VGIC_AFFINITY_LEVEL(val, 1) | \
-			    VGIC_AFFINITY_LEVEL(val, 2) | \
-			    VGIC_AFFINITY_LEVEL(val, 3))
-
-/*
- * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt,
- * below macros are defined for CPUREG encoding.
- */
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0
-
-#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
-
-/*
- * As per Documentation/virt/kvm/devices/arm-vgic-its.txt,
- * below macros are defined for ITS table entry encoding.
- */
-#define KVM_ITS_CTE_VALID_SHIFT		63
-#define KVM_ITS_CTE_VALID_MASK		BIT_ULL(63)
-#define KVM_ITS_CTE_RDBASE_SHIFT	16
-#define KVM_ITS_CTE_ICID_MASK		GENMASK_ULL(15, 0)
-#define KVM_ITS_ITE_NEXT_SHIFT		48
-#define KVM_ITS_ITE_PINTID_SHIFT	16
-#define KVM_ITS_ITE_PINTID_MASK		GENMASK_ULL(47, 16)
-#define KVM_ITS_ITE_ICID_MASK		GENMASK_ULL(15, 0)
-#define KVM_ITS_DTE_VALID_SHIFT		63
-#define KVM_ITS_DTE_VALID_MASK		BIT_ULL(63)
-#define KVM_ITS_DTE_NEXT_SHIFT		49
-#define KVM_ITS_DTE_NEXT_MASK		GENMASK_ULL(62, 49)
-#define KVM_ITS_DTE_ITTADDR_SHIFT	5
-#define KVM_ITS_DTE_ITTADDR_MASK	GENMASK_ULL(48, 5)
-#define KVM_ITS_DTE_SIZE_MASK		GENMASK_ULL(4, 0)
-#define KVM_ITS_L1E_VALID_MASK		BIT_ULL(63)
-/* we only support 64 kB translation table page size */
-#define KVM_ITS_L1E_ADDR_MASK		GENMASK_ULL(51, 16)
-
-#define KVM_VGIC_V3_RDIST_INDEX_MASK	GENMASK_ULL(11, 0)
-#define KVM_VGIC_V3_RDIST_FLAGS_MASK	GENMASK_ULL(15, 12)
-#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT	12
-#define KVM_VGIC_V3_RDIST_BASE_MASK	GENMASK_ULL(51, 16)
-#define KVM_VGIC_V3_RDIST_COUNT_MASK	GENMASK_ULL(63, 52)
-#define KVM_VGIC_V3_RDIST_COUNT_SHIFT	52
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
-#else
-#define DEBUG_SPINLOCK_BUG_ON(p)
-#endif
-
-/* Requires the irq_lock to be held by the caller. */
-static inline bool irq_is_pending(struct vgic_irq *irq)
-{
-	if (irq->config == VGIC_CONFIG_EDGE)
-		return irq->pending_latch;
-	else
-		return irq->pending_latch || irq->line_level;
-}
-
-static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
-{
-	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
-}
-
-static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
-{
-	/* Account for the active state as an interrupt */
-	if (vgic_irq_is_sgi(irq->intid) && irq->source)
-		return hweight8(irq->source) + irq->active;
-
-	return irq_is_pending(irq) || irq->active;
-}
-
-static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
-{
-	return vgic_irq_get_lr_count(irq) > 1;
-}
-
-/*
- * This struct provides an intermediate representation of the fields contained
- * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
- * state to userspace can generate either GICv2 or GICv3 CPU interface
- * registers regardless of the hardware backed GIC used.
- */
-struct vgic_vmcr {
-	u32	grpen0;
-	u32	grpen1;
-
-	u32	ackctl;
-	u32	fiqen;
-	u32	cbpr;
-	u32	eoim;
-
-	u32	abpr;
-	u32	bpr;
-	u32	pmr;  /* Priority mask field in the GICC_PMR and
-		       * ICC_PMR_EL1 priority field format */
-};
-
-struct vgic_reg_attr {
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr);
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr);
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-		     gpa_t addr, int len);
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-			      u32 intid);
-void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
-bool vgic_get_phys_line_level(struct vgic_irq *irq);
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-			   unsigned long flags);
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-		      phys_addr_t addr, phys_addr_t alignment);
-
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			  int offset, u32 *val);
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
-int vgic_v2_probe(const struct gic_kvm_info *info);
-int vgic_v2_map_resources(struct kvm *kvm);
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-			     enum vgic_type);
-
-void vgic_v2_init_lrs(void);
-void vgic_v2_load(struct kvm_vcpu *vcpu);
-void vgic_v2_put(struct kvm_vcpu *vcpu);
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu);
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
-{
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
-
-	kref_get(&irq->refcount);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
-int vgic_v3_probe(const struct gic_kvm_info *info);
-int vgic_v3_map_resources(struct kvm *kvm);
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
-int vgic_v3_save_pending_tables(struct kvm *kvm);
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
-bool vgic_v3_check_base(struct kvm *kvm);
-
-void vgic_v3_load(struct kvm_vcpu *vcpu);
-void vgic_v3_put(struct kvm_vcpu *vcpu);
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
-
-bool vgic_has_its(struct kvm *kvm);
-int kvm_vgic_register_its_device(void);
-void vgic_enable_lpis(struct kvm_vcpu *vcpu);
-void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 u64 id, u64 *val);
-int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
-				u64 *reg);
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-				    u32 intid, u64 *val);
-int kvm_register_vgic_device(unsigned long type);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-int vgic_lazy_init(struct kvm *kvm);
-int vgic_init(struct kvm *kvm);
-
-void vgic_debug_init(struct kvm *kvm);
-void vgic_debug_destroy(struct kvm *kvm);
-
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
-static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
-
-	/*
-	 * num_pri_bits are initialized with HW supported values.
-	 * We can rely safely on num_pri_bits even if VM has not
-	 * restored ICC_CTLR_EL1 before restoring APnR registers.
-	 */
-	switch (cpu_if->num_pri_bits) {
-	case 7: return 3;
-	case 6: return 1;
-	default: return 0;
-	}
-}
-
-static inline bool
-vgic_v3_redist_region_full(struct vgic_redist_region *region)
-{
-	if (!region->count)
-		return false;
-
-	return (region->free_index >= region->count);
-}
-
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
-
-static inline size_t
-vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
-{
-	if (!rdreg->count)
-		return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
-	else
-		return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-							   u32 index);
-
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
-
-static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-
-	return (base + size > d->vgic_dist_base) &&
-		(base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
-}
-
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-			 u32 devid, u32 eventid, struct vgic_irq **irq);
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
-int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
-void vgic_lpi_translation_cache_init(struct kvm *kvm);
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
-void vgic_its_invalidate_cache(struct kvm *kvm);
-
-bool vgic_supports_direct_msis(struct kvm *kvm);
-int vgic_v4_init(struct kvm *kvm);
-void vgic_v4_teardown(struct kvm *kvm);
-void vgic_v4_configure_vsgis(struct kvm *kvm);
-
-#endif
-- 
cgit v1.2.3


From d82755b2e781c8989614c82df7582f5649e265b8 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 5 May 2020 16:45:17 +0100
Subject: KVM: arm64: Kill off CONFIG_KVM_ARM_HOST

CONFIG_KVM_ARM_HOST is just a proxy for CONFIG_KVM, so remove it in favour
of the latter.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200505154520.194120-2-tabba@google.com
---
 arch/arm64/kernel/asm-offsets.c |  2 +-
 arch/arm64/kernel/cpu_errata.c  |  2 +-
 arch/arm64/kernel/smp.c         |  2 +-
 arch/arm64/kvm/Kconfig          |  6 -----
 arch/arm64/kvm/Makefile         | 52 ++++++++++++++++++++---------------------
 arch/arm64/kvm/hyp/Makefile     | 22 ++++++++---------
 6 files changed, 40 insertions(+), 46 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 9981a0a5a87f..a27e0cd731e9 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -96,7 +96,7 @@ int main(void)
   DEFINE(CPU_BOOT_PTRAUTH_KEY,	offsetof(struct secondary_data, ptrauth_key));
 #endif
   BLANK();
-#ifdef CONFIG_KVM_ARM_HOST
+#ifdef CONFIG_KVM
   DEFINE(VCPU_CONTEXT,		offsetof(struct kvm_vcpu, arch.ctxt));
   DEFINE(VCPU_FAULT_DISR,	offsetof(struct kvm_vcpu, arch.fault.disr_el1));
   DEFINE(VCPU_WORKAROUND_FLAGS,	offsetof(struct kvm_vcpu, arch.workaround_flags));
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index df56d2295d16..a102321fc8a2 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -234,7 +234,7 @@ static int detect_harden_bp_fw(void)
 		smccc_end = NULL;
 		break;
 
-#if IS_ENABLED(CONFIG_KVM_ARM_HOST)
+#if IS_ENABLED(CONFIG_KVM)
 	case SMCCC_CONDUIT_SMC:
 		cb = call_smc_arch_workaround_1;
 		smccc_start = __smccc_workaround_1_smc;
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 061f60fe452f..0a3045d9f33f 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -430,7 +430,7 @@ static void __init hyp_mode_check(void)
 			   "CPU: CPUs started in inconsistent modes");
 	else
 		pr_info("CPU: All CPU(s) started at EL1\n");
-	if (IS_ENABLED(CONFIG_KVM_ARM_HOST))
+	if (IS_ENABLED(CONFIG_KVM))
 		kvm_compute_layout();
 }
 
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449386d76441..ce724e526689 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -28,7 +28,6 @@ config KVM
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select HAVE_KVM_ARCH_TLB_FLUSH_ALL
 	select KVM_MMIO
-	select KVM_ARM_HOST
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select SRCU
 	select KVM_VFIO
@@ -50,11 +49,6 @@ config KVM
 
 	  If unsure, say N.
 
-config KVM_ARM_HOST
-	bool
-	---help---
-	  Provides host support for ARM processors.
-
 config KVM_ARM_PMU
 	bool
 	---help---
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 7a3768538343..419696e615b3 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -7,33 +7,33 @@ ccflags-y += -I $(srctree)/$(src)
 
 KVM=../../../virt/kvm
 
-obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
-obj-$(CONFIG_KVM_ARM_HOST) += hyp/
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM) += hyp/
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o
-kvm-$(CONFIG_KVM_ARM_HOST) += arm.o mmu.o mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += psci.o perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += hypercalls.o
-kvm-$(CONFIG_KVM_ARM_HOST) += pvtime.o
+kvm-$(CONFIG_KVM) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM) += arm.o mmu.o mmio.o
+kvm-$(CONFIG_KVM) += psci.o perf.o
+kvm-$(CONFIG_KVM) += hypercalls.o
+kvm-$(CONFIG_KVM) += pvtime.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
-kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o
-kvm-$(CONFIG_KVM_ARM_HOST) += aarch32.o
-kvm-$(CONFIG_KVM_ARM_HOST) += arch_timer.o
+kvm-$(CONFIG_KVM) += inject_fault.o regmap.o va_layout.o
+kvm-$(CONFIG_KVM) += hyp.o hyp-init.o handle_exit.o
+kvm-$(CONFIG_KVM) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM) += vgic-sys-reg-v3.o fpsimd.o pmu.o
+kvm-$(CONFIG_KVM) += aarch32.o
+kvm-$(CONFIG_KVM) += arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU)  += pmu-emul.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-init.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-irqfd.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-v4.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-mmio-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-kvm-device.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-its.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic/vgic-debug.o
+kvm-$(CONFIG_KVM) += vgic/vgic.o
+kvm-$(CONFIG_KVM) += vgic/vgic-init.o
+kvm-$(CONFIG_KVM) += vgic/vgic-irqfd.o
+kvm-$(CONFIG_KVM) += vgic/vgic-v2.o
+kvm-$(CONFIG_KVM) += vgic/vgic-v3.o
+kvm-$(CONFIG_KVM) += vgic/vgic-v4.o
+kvm-$(CONFIG_KVM) += vgic/vgic-mmio.o
+kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v2.o
+kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v3.o
+kvm-$(CONFIG_KVM) += vgic/vgic-kvm-device.o
+kvm-$(CONFIG_KVM) += vgic/vgic-its.o
+kvm-$(CONFIG_KVM) += vgic/vgic-debug.o
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index dc18274a6826..8229e47ba870 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -6,17 +6,17 @@
 ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \
 		$(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += aarch32.o
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
-obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += switch.o
-obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
-obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
-obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
+obj-$(CONFIG_KVM) += vgic-v3-sr.o
+obj-$(CONFIG_KVM) += timer-sr.o
+obj-$(CONFIG_KVM) += aarch32.o
+obj-$(CONFIG_KVM) += vgic-v2-cpuif-proxy.o
+obj-$(CONFIG_KVM) += sysreg-sr.o
+obj-$(CONFIG_KVM) += debug-sr.o
+obj-$(CONFIG_KVM) += entry.o
+obj-$(CONFIG_KVM) += switch.o
+obj-$(CONFIG_KVM) += fpsimd.o
+obj-$(CONFIG_KVM) += tlb.o
+obj-$(CONFIG_KVM) += hyp-entry.o
 
 # KVM code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may
-- 
cgit v1.2.3


From bf7bc1df30f6c6afa34d4d1d53e1c8ad93510d3e Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 5 May 2020 16:45:18 +0100
Subject: KVM: arm64: Update help text

arm64 KVM supports 16k pages since 02e0b7600f83
("arm64: kvm: Add support for 16K pages"), so update the Kconfig help
text accordingly.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200505154520.194120-3-tabba@google.com
---
 arch/arm64/kvm/Kconfig | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index ce724e526689..d2cf4f099454 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -44,8 +44,6 @@ config KVM
 	select TASK_DELAY_ACCT
 	---help---
 	  Support hosting virtualized guest machines.
-	  We don't support KVM with 16K page tables yet, due to the multiple
-	  levels of fake page tables.
 
 	  If unsure, say N.
 
-- 
cgit v1.2.3


From f26133624d602b0d984815168a2d3a1f630b02e2 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 5 May 2020 16:45:19 +0100
Subject: KVM: arm64: Change CONFIG_KVM to a menuconfig entry

Changing CONFIG_KVM to be a 'menuconfig' entry in Kconfig mean that we
can straightforwardly enumerate optional features, such as the virtual
PMU device as dependent options.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200505154520.194120-4-tabba@google.com
---
 arch/arm64/kvm/Kconfig | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index d2cf4f099454..f1c1f981482c 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -3,7 +3,6 @@
 # KVM configuration
 #
 
-source "virt/kvm/Kconfig"
 source "virt/lib/Kconfig"
 
 menuconfig VIRTUALIZATION
@@ -18,7 +17,7 @@ menuconfig VIRTUALIZATION
 
 if VIRTUALIZATION
 
-config KVM
+menuconfig KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
 	depends on OF
 	# for TASKSTATS/TASK_DELAY_ACCT:
@@ -33,7 +32,6 @@ config KVM
 	select KVM_VFIO
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
-	select KVM_ARM_PMU if HW_PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQ_ROUTING
@@ -47,13 +45,21 @@ config KVM
 
 	  If unsure, say N.
 
+if KVM
+
+source "virt/kvm/Kconfig"
+
 config KVM_ARM_PMU
-	bool
+	bool "Virtual Performance Monitoring Unit (PMU) support"
+	depends on HW_PERF_EVENTS
+	default y
 	---help---
 	  Adds support for a virtual Performance Monitoring Unit (PMU) in
 	  virtual machines.
 
 config KVM_INDIRECT_VECTORS
-       def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
+	def_bool HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS
+
+endif # KVM
 
 endif # VIRTUALIZATION
-- 
cgit v1.2.3


From 25357de01b95140ecacd4d9347d74df2dda789f2 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Tue, 5 May 2020 16:45:20 +0100
Subject: KVM: arm64: Clean up kvm makefiles

Consolidate references to the CONFIG_KVM configuration item to encompass
entire folders rather than per line.

Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200505154520.194120-5-tabba@google.com
---
 arch/arm64/kvm/Makefile     | 38 +++++++++++++-------------------------
 arch/arm64/kvm/hyp/Makefile | 15 ++++-----------
 2 files changed, 17 insertions(+), 36 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 419696e615b3..8d3d9513cbfe 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -10,30 +10,18 @@ KVM=../../../virt/kvm
 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM) += hyp/
 
-kvm-$(CONFIG_KVM) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
-kvm-$(CONFIG_KVM) += $(KVM)/eventfd.o $(KVM)/vfio.o $(KVM)/irqchip.o
-kvm-$(CONFIG_KVM) += arm.o mmu.o mmio.o
-kvm-$(CONFIG_KVM) += psci.o perf.o
-kvm-$(CONFIG_KVM) += hypercalls.o
-kvm-$(CONFIG_KVM) += pvtime.o
+kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
+	 $(KVM)/vfio.o $(KVM)/irqchip.o \
+	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
+	 inject_fault.o regmap.o va_layout.o hyp.o hyp-init.o handle_exit.o \
+	 guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o \
+	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
+	 aarch32.o arch_timer.o \
+	 vgic/vgic.o vgic/vgic-init.o \
+	 vgic/vgic-irqfd.o vgic/vgic-v2.o \
+	 vgic/vgic-v3.o vgic/vgic-v4.o \
+	 vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
+	 vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
+	 vgic/vgic-its.o vgic/vgic-debug.o
 
-kvm-$(CONFIG_KVM) += inject_fault.o regmap.o va_layout.o
-kvm-$(CONFIG_KVM) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
-kvm-$(CONFIG_KVM) += vgic-sys-reg-v3.o fpsimd.o pmu.o
-kvm-$(CONFIG_KVM) += aarch32.o
-kvm-$(CONFIG_KVM) += arch_timer.o
 kvm-$(CONFIG_KVM_ARM_PMU)  += pmu-emul.o
-
-kvm-$(CONFIG_KVM) += vgic/vgic.o
-kvm-$(CONFIG_KVM) += vgic/vgic-init.o
-kvm-$(CONFIG_KVM) += vgic/vgic-irqfd.o
-kvm-$(CONFIG_KVM) += vgic/vgic-v2.o
-kvm-$(CONFIG_KVM) += vgic/vgic-v3.o
-kvm-$(CONFIG_KVM) += vgic/vgic-v4.o
-kvm-$(CONFIG_KVM) += vgic/vgic-mmio.o
-kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v2.o
-kvm-$(CONFIG_KVM) += vgic/vgic-mmio-v3.o
-kvm-$(CONFIG_KVM) += vgic/vgic-kvm-device.o
-kvm-$(CONFIG_KVM) += vgic/vgic-its.o
-kvm-$(CONFIG_KVM) += vgic/vgic-debug.o
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 8229e47ba870..8c9880783839 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -6,17 +6,10 @@
 ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \
 		$(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vgic-v3-sr.o
-obj-$(CONFIG_KVM) += timer-sr.o
-obj-$(CONFIG_KVM) += aarch32.o
-obj-$(CONFIG_KVM) += vgic-v2-cpuif-proxy.o
-obj-$(CONFIG_KVM) += sysreg-sr.o
-obj-$(CONFIG_KVM) += debug-sr.o
-obj-$(CONFIG_KVM) += entry.o
-obj-$(CONFIG_KVM) += switch.o
-obj-$(CONFIG_KVM) += fpsimd.o
-obj-$(CONFIG_KVM) += tlb.o
-obj-$(CONFIG_KVM) += hyp-entry.o
+obj-$(CONFIG_KVM) += hyp.o
+
+hyp-y := vgic-v3-sr.o timer-sr.o aarch32.o vgic-v2-cpuif-proxy.o sysreg-sr.o \
+	 debug-sr.o entry.o switch.o fpsimd.o tlb.o hyp-entry.o
 
 # KVM code is run at a different exception code with a different map, so
 # compiler instrumentation that inserts callbacks or checks into the code may
-- 
cgit v1.2.3


From c6fe89ff8b250ad4dc4bed7bd5877bfbc35f4aba Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 13 May 2020 11:58:29 +0100
Subject: KVM: arm64: Simplify __kvm_timer_set_cntvoff implementation

Now that this function isn't constrained by the 32bit PCS,
let's simplify it by taking a single 64bit offset instead
of two 32bit parameters.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h |  2 +-
 arch/arm64/kvm/arch_timer.c      | 12 +-----------
 arch/arm64/kvm/hyp/timer-sr.c    |  3 +--
 3 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 7c7eeeaab9fa..59e314f38e43 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -64,7 +64,7 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
 
-extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+extern void __kvm_timer_set_cntvoff(u64 cntvoff);
 
 extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 93bd59b46848..487eba9f87cd 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -451,17 +451,7 @@ out:
 
 static void set_cntvoff(u64 cntvoff)
 {
-	u32 low = lower_32_bits(cntvoff);
-	u32 high = upper_32_bits(cntvoff);
-
-	/*
-	 * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
-	 * 32-bit systems, but rather passes register by register shifted one
-	 * place (we put the function address in r0/x0), we cannot simply pass
-	 * a 64-bit value as an argument, but have to split the value in two
-	 * 32-bit halves.
-	 */
-	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
+	kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
 }
 
 static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c
index ff76e6845fe4..fb5c0be33223 100644
--- a/arch/arm64/kvm/hyp/timer-sr.c
+++ b/arch/arm64/kvm/hyp/timer-sr.c
@@ -10,9 +10,8 @@
 
 #include <asm/kvm_hyp.h>
 
-void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
+void __hyp_text __kvm_timer_set_cntvoff(u64 cntvoff)
 {
-	u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
 	write_sysreg(cntvoff, cntvoff_el2);
 }
 
-- 
cgit v1.2.3


From ce6f8f02f9f6786355fa6c79d88b839639dd75d8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 13 May 2020 11:38:28 +0100
Subject: KVM: arm64: Use cpus_have_final_cap for has_vhe()

By the time we start using the has_vhe() helper, we have long
discovered whether we are running VHE or not. It thus makes
sense to use cpus_have_final_cap() instead of cpus_have_const_cap(),
which leads to a small text size reduction.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: David Brazdil <dbrazdil@google.com>
Link: https://lore.kernel.org/r/20200513103828.74580-1-maz@kernel.org
---
 arch/arm64/include/asm/virt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 61fd26752adc..5051b388c654 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -85,7 +85,7 @@ static inline bool is_kernel_in_hyp_mode(void)
 
 static __always_inline bool has_vhe(void)
 {
-	if (cpus_have_const_cap(ARM64_HAS_VIRT_HOST_EXTN))
+	if (cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN))
 		return true;
 
 	return false;
-- 
cgit v1.2.3


From 656012c731fcfd0f770007366e2b952a613745f2 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Wed, 1 Apr 2020 15:03:10 +0100
Subject: KVM: Fix spelling in code comments

Fix spelling and typos (e.g., repeated words) in comments.

Signed-off-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200401140310.29701-1-tabba@google.com
---
 arch/arm64/kvm/arm.c            | 6 +++---
 arch/arm64/kvm/guest.c          | 4 ++--
 arch/arm64/kvm/hyp/vgic-v3-sr.c | 2 +-
 arch/arm64/kvm/mmio.c           | 2 +-
 arch/arm64/kvm/mmu.c            | 6 +++---
 arch/arm64/kvm/psci.c           | 6 +++---
 arch/arm64/kvm/reset.c          | 6 +++---
 arch/arm64/kvm/sys_regs.c       | 6 +++---
 arch/arm64/kvm/vgic/vgic-v3.c   | 2 +-
 virt/kvm/coalesced_mmio.c       | 2 +-
 virt/kvm/eventfd.c              | 2 +-
 virt/kvm/kvm_main.c             | 2 +-
 12 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c958bb37b769..ee1b5bba1d08 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -455,9 +455,9 @@ void force_vm_exit(const cpumask_t *mask)
  *
  * The hardware supports a limited set of values with the value zero reserved
  * for the host, so we check if an assigned value belongs to a previous
- * generation, which which requires us to assign a new value. If we're the
- * first to use a VMID for the new generation, we must flush necessary caches
- * and TLBs on all CPUs.
+ * generation, which requires us to assign a new value. If we're the first to
+ * use a VMID for the new generation, we must flush necessary caches and TLBs
+ * on all CPUs.
  */
 static bool need_new_vmid_gen(struct kvm_vmid *vmid)
 {
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 50a279d3ddd7..871d51729b63 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -267,7 +267,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 	/*
 	 * Vector lengths supported by the host can't currently be
 	 * hidden from the guest individually: instead we can only set a
-	 * maxmium via ZCR_EL2.LEN.  So, make sure the available vector
+	 * maximum via ZCR_EL2.LEN.  So, make sure the available vector
 	 * lengths match the set requested exactly up to the requested
 	 * maximum:
 	 */
@@ -337,7 +337,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
 	unsigned int reg_num;
 
 	unsigned int reqoffset, reqlen; /* User-requested offset and length */
-	unsigned int maxlen; /* Maxmimum permitted length */
+	unsigned int maxlen; /* Maximum permitted length */
 
 	size_t sve_state_size;
 
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 49fedf6710f9..6b85773e15c4 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -577,7 +577,7 @@ static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
 
 /*
  * The priority value is independent of any of the BPR values, so we
- * normalize it using the minumal BPR value. This guarantees that no
+ * normalize it using the minimal BPR value. This guarantees that no
  * matter what the guest does with its BPR, we can always set/get the
  * same value of a priority.
  */
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index aedfcff99ac5..4e0366759726 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -131,7 +131,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 
 	/*
 	 * No valid syndrome? Ask userspace for help if it has
-	 * voluntered to do so, and bail out otherwise.
+	 * volunteered to do so, and bail out otherwise.
 	 */
 	if (!kvm_vcpu_dabt_isvalid(vcpu)) {
 		if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index e3b9ee268823..29d8f24df944 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -784,7 +784,7 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
 	mutex_lock(&kvm_hyp_pgd_mutex);
 
 	/*
-	 * This assumes that we we have enough space below the idmap
+	 * This assumes that we have enough space below the idmap
 	 * page to allocate our VAs. If not, the check below will
 	 * kick. A potential alternative would be to detect that
 	 * overflow and switch to an allocation above the idmap.
@@ -964,7 +964,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
  * @kvm: The struct kvm pointer
  *
- * Go through the memregions and unmap any reguler RAM
+ * Go through the memregions and unmap any regular RAM
  * backing memory already mapped to the VM.
  */
 void stage2_unmap_vm(struct kvm *kvm)
@@ -2262,7 +2262,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 {
 	/*
 	 * At this point memslot has been committed and there is an
-	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
+	 * allocated dirty_bitmap[], dirty pages will be tracked while the
 	 * memory slot is write protected.
 	 */
 	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index ae364716ee40..83415e96b589 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -94,7 +94,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 
 	/*
 	 * NOTE: We always update r0 (or x0) because for PSCI v0.1
-	 * the general puspose registers are undefined upon CPU_ON.
+	 * the general purpose registers are undefined upon CPU_ON.
 	 */
 	reset_state->r0 = smccc_get_arg3(source_vcpu);
 
@@ -265,10 +265,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 	case PSCI_0_2_FN_SYSTEM_OFF:
 		kvm_psci_system_off(vcpu);
 		/*
-		 * We should'nt be going back to guest VCPU after
+		 * We shouldn't be going back to guest VCPU after
 		 * receiving SYSTEM_OFF request.
 		 *
-		 * If user space accidently/deliberately resumes
+		 * If user space accidentally/deliberately resumes
 		 * guest VCPU after SYSTEM_OFF request then guest
 		 * VCPU should see internal failure from PSCI return
 		 * value. To achieve this, we preload r0 (or x0) with
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 30b7ea680f66..658f3a79617b 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -163,7 +163,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
 	vl = vcpu->arch.sve_max_vl;
 
 	/*
-	 * Resposibility for these properties is shared between
+	 * Responsibility for these properties is shared between
 	 * kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and
 	 * set_sve_vls().  Double-check here just to be sure:
 	 */
@@ -249,7 +249,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
  * ioctl or as part of handling a request issued by another VCPU in the PSCI
  * handling code.  In the first case, the VCPU will not be loaded, and in the
  * second case the VCPU will be loaded.  Because this function operates purely
- * on the memory-backed valus of system registers, we want to do a full put if
+ * on the memory-backed values of system registers, we want to do a full put if
  * we were loaded (handling a request) and load the values back at the end of
  * the function.  Otherwise we leave the state alone.  In both cases, we
  * disable preemption around the vcpu reset as we would otherwise race with
@@ -357,7 +357,7 @@ void kvm_set_ipa_limit(void)
 	 *
 	 * So clamp the ipa limit further down to limit the number of levels.
 	 * Since we can concatenate upto 16 tables at entry level, we could
-	 * go upto 4bits above the maximum VA addressible with the current
+	 * go upto 4bits above the maximum VA addressable with the current
 	 * number of levels.
 	 */
 	va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 51db934702b6..620eaf11e672 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -34,7 +34,7 @@
 #include "trace.h"
 
 /*
- * All of this file is extremly similar to the ARM coproc.c, but the
+ * All of this file is extremely similar to the ARM coproc.c, but the
  * types are different. My gut feeling is that it should be pretty
  * easy to merge, but that would be an ABI breakage -- again. VFP
  * would also need to be abstracted.
@@ -118,8 +118,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 	 * entry to the guest but are only restored on vcpu_load.
 	 *
 	 * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
-	 * should never be listed below, because the the MPIDR should only be
-	 * set once, before running the VCPU, and never changed later.
+	 * should never be listed below, because the MPIDR should only be set
+	 * once, before running the VCPU, and never changed later.
 	 */
 	switch (reg) {
 	case CSSELR_EL1:	write_sysreg_s(val, SYS_CSSELR_EL1);	return;
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 5bc2ab58954b..3ccd6d3cb4d3 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -587,7 +587,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
 	int ret;
 
 	/*
-	 * The ListRegs field is 5 bits, but there is a architectural
+	 * The ListRegs field is 5 bits, but there is an architectural
 	 * maximum of 16 list registers. Just ignore bit 4...
 	 */
 	kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 00c747dbc82e..e2c197fd4f9d 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -119,7 +119,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
 
 	/*
 	 * We're using this spinlock to sync access to the coalesced ring.
-	 * The list doesn't need it's own lock since device registration and
+	 * The list doesn't need its own lock since device registration and
 	 * unregistration should only happen when kvm->slots_lock is held.
 	 */
 	spin_lock_init(&kvm->ring_lock);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 67b6fc153e9c..e586d1395c28 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -116,7 +116,7 @@ irqfd_shutdown(struct work_struct *work)
 	struct kvm *kvm = irqfd->kvm;
 	u64 cnt;
 
-	/* Make sure irqfd has been initalized in assign path. */
+	/* Make sure irqfd has been initialized in assign path. */
 	synchronize_srcu(&kvm->irq_srcu);
 
 	/*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 74bdb7bf3295..f57792b1541b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2799,7 +2799,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
  *
  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
  *  (preempted lock holder), indicated by @in_spin_loop.
- *  Set at the beiginning and cleared at the end of interception/PLE handler.
+ *  Set at the beginning and cleared at the end of interception/PLE handler.
  *
  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
  *  chance last time (mostly it has become eligible now since we have probably
-- 
cgit v1.2.3


From 892713e97ca146591515b3c115f99cdf632030fb Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Wed, 15 Apr 2020 15:28:35 +0800
Subject: KVM: arm64: Sidestep stage2_unmap_vm() on vcpu reset when S2FWB is
 supported

stage2_unmap_vm() was introduced to unmap user RAM region in the stage2
page table to make the caches coherent. E.g., a guest reboot with stage1
MMU disabled will access memory using non-cacheable attributes. If the
RAM and caches are not coherent at this stage, some evicted dirty cache
line may go and corrupt guest data in RAM.

Since ARMv8.4, S2FWB feature is mandatory and KVM will take advantage
of it to configure the stage2 page table and the attributes of memory
access. So we ensure that guests always access memory using cacheable
attributes and thus, the caches always be coherent.

So on CPUs that support S2FWB, we can safely reset the vcpu without a
heavy stage2 unmapping.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200415072835.1164-1-yuzenghui@huawei.com
---
 arch/arm64/kvm/arm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index ee1b5bba1d08..0ea9a0266d9a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -983,8 +983,11 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	/*
 	 * Ensure a rebooted VM will fault in RAM pages and detect if the
 	 * guest MMU is turned off and flush the caches as needed.
+	 *
+	 * S2FWB enforces all memory accesses to RAM being cacheable, we
+	 * ensure that the cache is always coherent.
 	 */
-	if (vcpu->arch.has_run_once)
+	if (vcpu->arch.has_run_once && !cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
 		stage2_unmap_vm(vcpu->kvm);
 
 	vcpu_reset_hcr(vcpu);
-- 
cgit v1.2.3


From 48c963e31bc664afafd31058483ea8390da63980 Mon Sep 17 00:00:00 2001
From: Jiang Yi <giangyi@amazon.com>
Date: Wed, 15 Apr 2020 10:42:29 +0200
Subject: KVM: arm/arm64: Release kvm->mmu_lock in loop to prevent starvation

Do cond_resched_lock() in stage2_flush_memslot() like what is done in
unmap_stage2_range() and other places holding mmu_lock while processing
a possibly large range of memory.

Signed-off-by: Jiang Yi <giangyi@amazon.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20200415084229.29992-1-giangyi@amazon.com
---
 arch/arm64/kvm/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 29d8f24df944..917363375e8a 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -422,6 +422,9 @@ static void stage2_flush_memslot(struct kvm *kvm,
 		next = stage2_pgd_addr_end(kvm, addr, end);
 		if (!stage2_pgd_none(kvm, *pgd))
 			stage2_flush_puds(kvm, pgd, addr, next);
+
+		if (next != end)
+			cond_resched_lock(&kvm->mmu_lock);
 	} while (pgd++, addr = next, addr != end);
 }
 
-- 
cgit v1.2.3


From 9f2836146b11cdf98d5c8f8f71b0fce28fbd83c8 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Thu, 7 May 2020 20:35:45 +0800
Subject: KVM: arm64: Clean up the checking for huge mapping

If we are checking whether the stage2 can map PAGE_SIZE,
we don't have to do the boundary checks as both the host
VMA and the guest memslots are page aligned. Bail the case
easily.

While we're at it, fixup a typo in the comment below.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200507123546.1875-2-yuzenghui@huawei.com
---
 arch/arm64/kvm/mmu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 917363375e8a..ccb44e7d30d9 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1610,6 +1610,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
 	hva_t uaddr_start, uaddr_end;
 	size_t size;
 
+	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
+	if (map_size == PAGE_SIZE)
+		return true;
+
 	size = memslot->npages * PAGE_SIZE;
 
 	gpa_start = memslot->base_gfn << PAGE_SHIFT;
@@ -1629,7 +1633,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
 	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
 	 *    +-----+--------------------+--------------------+---+
 	 *
-	 *    memslot->base_gfn << PAGE_SIZE:
+	 *    memslot->base_gfn << PAGE_SHIFT:
 	 *      +---+--------------------+--------------------+-----+
 	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
 	 *      +---+--------------------+--------------------+-----+
-- 
cgit v1.2.3


From 0529c9021252a58b6d3808da86986a614b900b1b Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Thu, 7 May 2020 20:35:46 +0800
Subject: KVM: arm64: Unify handling THP backed host memory

We support mapping host memory backed by PMD transparent hugepages
at stage2 as huge pages. However the checks are now spread across
two different places. Let us unify the handling of the THPs to
keep the code cleaner (and future proof for PUD THP support).
This patch moves transparent_hugepage_adjust() closer to the caller
to avoid a forward declaration for fault_supports_stage2_huge_mappings().

Also, since we already handle the case where the host VA and the guest
PA may not be aligned, the explicit VM_BUG_ON() is not required.

Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200507123546.1875-3-yuzenghui@huawei.com
---
 arch/arm64/kvm/mmu.c | 115 +++++++++++++++++++++++++++------------------------
 1 file changed, 60 insertions(+), 55 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ccb44e7d30d9..66eb8e3f6e8c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1375,47 +1375,6 @@ out:
 	return ret;
 }
 
-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
-{
-	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *ipap >> PAGE_SHIFT;
-
-	if (kvm_is_transparent_hugepage(pfn)) {
-		unsigned long mask;
-		/*
-		 * The address we faulted on is backed by a transparent huge
-		 * page.  However, because we map the compound huge page and
-		 * not the individual tail page, we need to transfer the
-		 * refcount to the head page.  We have to be careful that the
-		 * THP doesn't start to split while we are adjusting the
-		 * refcounts.
-		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
-		 * was successful and we are holding the mmu_lock, so if this
-		 * THP is trying to split, it will be blocked in the mmu
-		 * notifier before touching any of the pages, specifically
-		 * before being able to call __split_huge_page_refcount().
-		 *
-		 * We can therefore safely transfer the refcount from PG_tail
-		 * to PG_head and switch the pfn from a tail page to the head
-		 * page accordingly.
-		 */
-		mask = PTRS_PER_PMD - 1;
-		VM_BUG_ON((gfn & mask) != (pfn & mask));
-		if (pfn & mask) {
-			*ipap &= PMD_MASK;
-			kvm_release_pfn_clean(pfn);
-			pfn &= ~mask;
-			kvm_get_pfn(pfn);
-			*pfnp = pfn;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
 /**
  * stage2_wp_ptes - write protect PMD range
  * @pmd:	pointer to pmd entry
@@ -1663,6 +1622,59 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
 	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 }
 
+/*
+ * Check if the given hva is backed by a transparent huge page (THP) and
+ * whether it can be mapped using block mapping in stage2. If so, adjust
+ * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
+ * supported. This will need to be updated to support other THP sizes.
+ *
+ * Returns the size of the mapping.
+ */
+static unsigned long
+transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+			    unsigned long hva, kvm_pfn_t *pfnp,
+			    phys_addr_t *ipap)
+{
+	kvm_pfn_t pfn = *pfnp;
+
+	/*
+	 * Make sure the adjustment is done only for THP pages. Also make
+	 * sure that the HVA and IPA are sufficiently aligned and that the
+	 * block map is contained within the memslot.
+	 */
+	if (kvm_is_transparent_hugepage(pfn) &&
+	    fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+		/*
+		 * The address we faulted on is backed by a transparent huge
+		 * page.  However, because we map the compound huge page and
+		 * not the individual tail page, we need to transfer the
+		 * refcount to the head page.  We have to be careful that the
+		 * THP doesn't start to split while we are adjusting the
+		 * refcounts.
+		 *
+		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * was successful and we are holding the mmu_lock, so if this
+		 * THP is trying to split, it will be blocked in the mmu
+		 * notifier before touching any of the pages, specifically
+		 * before being able to call __split_huge_page_refcount().
+		 *
+		 * We can therefore safely transfer the refcount from PG_tail
+		 * to PG_head and switch the pfn from a tail page to the head
+		 * page accordingly.
+		 */
+		*ipap &= PMD_MASK;
+		kvm_release_pfn_clean(pfn);
+		pfn &= ~(PTRS_PER_PMD - 1);
+		kvm_get_pfn(pfn);
+		*pfnp = pfn;
+
+		return PMD_SIZE;
+	}
+
+	/* Use page mapping if we cannot use block mapping. */
+	return PAGE_SIZE;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -1776,20 +1788,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
 
-	if (vma_pagesize == PAGE_SIZE && !force_pte) {
-		/*
-		 * Only PMD_SIZE transparent hugepages(THP) are
-		 * currently supported. This code will need to be
-		 * updated to support other THP sizes.
-		 *
-		 * Make sure the host VA and the guest IPA are sufficiently
-		 * aligned and that the block is contained within the memslot.
-		 */
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-		    transparent_hugepage_adjust(&pfn, &fault_ipa))
-			vma_pagesize = PMD_SIZE;
-	}
-
+	/*
+	 * If we are not forced to use page mapping, check if we are
+	 * backed by a THP and thus use block mapping if possible.
+	 */
+	if (vma_pagesize == PAGE_SIZE && !force_pte)
+		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
+							   &pfn, &fault_ipa);
 	if (writable)
 		kvm_set_pfn_dirty(pfn);
 
-- 
cgit v1.2.3


From c862626e19efdc26b26481515470b160e8fe52f3 Mon Sep 17 00:00:00 2001
From: Keqian Zhu <zhukeqian1@huawei.com>
Date: Mon, 13 Apr 2020 20:20:23 +0800
Subject: KVM: arm64: Support enabling dirty log gradually in small chunks

There is already support of enabling dirty log gradually in small chunks
for x86 in commit 3c9bd4006bfc ("KVM: x86: enable dirty log gradually in
small chunks"). This adds support for arm64.

x86 still writes protect all huge pages when DIRTY_LOG_INITIALLY_ALL_SET
is enabled. However, for arm64, both huge pages and normal pages can be
write protected gradually by userspace.

Under the Huawei Kunpeng 920 2.6GHz platform, I did some tests on 128G
Linux VMs with different page size. The memory pressure is 127G in each
case. The time taken of memory_global_dirty_log_start in QEMU is listed
below:

Page Size      Before    After Optimization
  4K            650ms         1.8ms
  2M             4ms          1.8ms
  1G             2ms          1.8ms

Besides the time reduction, the biggest improvement is that we will minimize
the performance side effect (because of dissolving huge pages and marking
memslots dirty) on guest after enabling dirty log.

Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200413122023.52583-1-zhukeqian1@huawei.com
---
 Documentation/virt/kvm/api.rst    |  2 +-
 arch/arm64/include/asm/kvm_host.h |  3 +++
 arch/arm64/kvm/mmu.c              | 12 ++++++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index efbbe570aa9b..0017f63fa44f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5777,7 +5777,7 @@ will be initialized to 1 when created.  This also improves performance because
 dirty logging can be enabled gradually in small chunks on the first call
 to KVM_CLEAR_DIRTY_LOG.  KVM_DIRTY_LOG_INITIALLY_SET depends on
 KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (it is also only available on
-x86 for now).
+x86 and arm64 for now).
 
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 32c8a675e5a4..a723f84fab83 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -46,6 +46,9 @@
 #define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
 #define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
 
+#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+				     KVM_DIRTY_LOG_INITIALLY_SET)
+
 DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
 
 extern unsigned int kvm_sve_max_vl;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 66eb8e3f6e8c..ddf85bf21897 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2277,8 +2277,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * allocated dirty_bitmap[], dirty pages will be tracked while the
 	 * memory slot is write protected.
 	 */
-	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
-		kvm_mmu_wp_memory_region(kvm, mem->slot);
+	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+		/*
+		 * If we're with initial-all-set, we don't need to write
+		 * protect any pages because they're all reported as dirty.
+		 * Huge pages and normal pages will be write protect gradually.
+		 */
+		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+			kvm_mmu_wp_memory_region(kvm, mem->slot);
+		}
+	}
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-- 
cgit v1.2.3


From 5107000faa6e8c2b0ff7a91a6d1f010f84596cd2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Apr 2020 15:15:07 +0100
Subject: KVM: arm64: Make KVM_CAP_MAX_VCPUS compatible with the selected GIC
 version

KVM_CAP_MAX_VCPUS always return the maximum possible number of
VCPUs, irrespective of the selected interrupt controller. This
is pretty misleading for userspace that selects a GICv2 on a GICv3
system that supports v2 compat: It always gets a maximum of 512
VCPUs, even if the effective limit is 8. The 9th VCPU will fail
to be created, which is unexpected as far as userspace is concerned.

Fortunately, we already have the right information stashed in the
kvm structure, and we can return it as requested.

Reported-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Tested-by: Alexandru Elisei <alexandru.elisei@arm.com>
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Link: https://lore.kernel.org/r/20200427141507.284985-1-maz@kernel.org
---
 arch/arm64/kvm/arm.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 0ea9a0266d9a..e01d44df98df 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -95,6 +95,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 	return r;
 }
 
+static int kvm_arm_default_max_vcpus(void)
+{
+	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
+}
+
 /**
  * kvm_arch_init_vm - initializes a VM data structure
  * @kvm:	pointer to the KVM struct
@@ -128,8 +133,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.vmid.vmid_gen = 0;
 
 	/* The maximum number of VCPUs is limited by the host's GIC model */
-	kvm->arch.max_vcpus = vgic_present ?
-				kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
+	kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
 
 	return ret;
 out_free_stage2_pgd:
@@ -204,10 +208,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = num_online_cpus();
 		break;
 	case KVM_CAP_MAX_VCPUS:
-		r = KVM_MAX_VCPUS;
-		break;
 	case KVM_CAP_MAX_VCPU_ID:
-		r = KVM_MAX_VCPU_ID;
+		if (kvm)
+			r = kvm->arch.max_vcpus;
+		else
+			r = kvm_arm_default_max_vcpus();
 		break;
 	case KVM_CAP_MSI_DEVID:
 		if (!kvm)
-- 
cgit v1.2.3


From d43e2675e96fc6ae1a633b6a69d296394448cc32 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2020 05:34:41 -0400
Subject: KVM: x86: only do L1TF workaround on affected processors

KVM stores the gfn in MMIO SPTEs as a caching optimization.  These are split
in two parts, as in "[high 11111 low]", to thwart any attempt to use these bits
in an L1TF attack.  This works as long as there are 5 free bits between
MAXPHYADDR and bit 50 (inclusive), leaving bit 51 free so that the MMIO
access triggers a reserved-bit-set page fault.

The bit positions however were computed wrongly for AMD processors that have
encryption support.  In this case, x86_phys_bits is reduced (for example
from 48 to 43, to account for the C bit at position 47 and four bits used
internally to store the SEV ASID and other stuff) while x86_cache_bits in
would remain set to 48, and _all_ bits between the reduced MAXPHYADDR
and bit 51 are set.  Then low_phys_bits would also cover some of the
bits that are set in the shadow_mmio_value, terribly confusing the gfn
caching mechanism.

To fix this, avoid splitting gfns as long as the processor does not have
the L1TF bug (which includes all AMD processors).  When there is no
splitting, low_phys_bits can be set to the reduced MAXPHYADDR removing
the overlap.  This fixes "npt=0" operation on EPYC processors.

Thanks to Maxim Levitsky for bisecting this bug.

Cc: stable@vger.kernel.org
Fixes: 52918ed5fcf0 ("KVM: SVM: Override default MMIO mask if memory encryption is enabled")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..86619631ff6a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -335,6 +335,8 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
 {
 	BUG_ON((u64)(unsigned)access_mask != access_mask);
 	BUG_ON((mmio_mask & mmio_value) != mmio_value);
+	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
 	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
 	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 	shadow_mmio_access_mask = access_mask;
@@ -583,16 +585,15 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	 * the most significant bits of legal physical address space.
 	 */
 	shadow_nonpresent_or_rsvd_mask = 0;
-	low_phys_bits = boot_cpu_data.x86_cache_bits;
-	if (boot_cpu_data.x86_cache_bits <
-	    52 - shadow_nonpresent_or_rsvd_mask_len) {
+	low_phys_bits = boot_cpu_data.x86_phys_bits;
+	if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+	    !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+			  52 - shadow_nonpresent_or_rsvd_mask_len)) {
+		low_phys_bits = boot_cpu_data.x86_cache_bits
+			- shadow_nonpresent_or_rsvd_mask_len;
 		shadow_nonpresent_or_rsvd_mask =
-			rsvd_bits(boot_cpu_data.x86_cache_bits -
-				  shadow_nonpresent_or_rsvd_mask_len,
-				  boot_cpu_data.x86_cache_bits - 1);
-		low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
-	} else
-		WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
+			rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+	}
 
 	shadow_nonpresent_or_rsvd_lower_gfn_mask =
 		GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
-- 
cgit v1.2.3


From ef68017eb5704eb2b0577c3aa6619e13caf2b59f Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 28 Feb 2020 10:42:48 -0800
Subject: x86/kvm: Handle async page faults directly through do_page_fault()

KVM overloads #PF to indicate two types of not-actually-page-fault
events.  Right now, the KVM guest code intercepts them by modifying
the IDT and hooking the #PF vector.  This makes the already fragile
fault code even harder to understand, and it also pollutes call
traces with async_page_fault and do_async_page_fault for normal page
faults.

Clean it up by moving the logic into do_page_fault() using a static
branch.  This gets rid of the platform trap_init override mechanism
completely.

[ tglx: Fixed up 32bit, removed error code from the async functions and
  	massaged coding style ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200505134059.169270470@linutronix.de
---
 arch/x86/entry/entry_32.S       |  8 --------
 arch/x86/entry/entry_64.S       |  4 ----
 arch/x86/include/asm/kvm_para.h | 19 +++++++++++++++++--
 arch/x86/include/asm/x86_init.h |  2 --
 arch/x86/kernel/kvm.c           | 39 +++++++++++++++++++++------------------
 arch/x86/kernel/traps.c         |  2 --
 arch/x86/kernel/x86_init.c      |  1 -
 arch/x86/mm/fault.c             | 19 +++++++++++++++++++
 8 files changed, 57 insertions(+), 37 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b67bae7091d7..8ba0985f5016 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1693,14 +1693,6 @@ SYM_CODE_START(general_protection)
 	jmp	common_exception
 SYM_CODE_END(general_protection)
 
-#ifdef CONFIG_KVM_GUEST
-SYM_CODE_START(async_page_fault)
-	ASM_CLAC
-	pushl	$do_async_page_fault
-	jmp	common_exception_read_cr2
-SYM_CODE_END(async_page_fault)
-#endif
-
 SYM_CODE_START(rewind_stack_do_exit)
 	/* Prevent any naive code from trying to unwind to our caller. */
 	xorl	%ebp, %ebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3063aa9090f9..9ab3ea6d02fc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1202,10 +1202,6 @@ idtentry xendebug		do_debug		has_error_code=0
 idtentry general_protection	do_general_protection	has_error_code=1
 idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
-#endif
-
 #ifdef CONFIG_X86_MCE
 idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 9b4df6eaa11a..5261363adda3 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -91,8 +91,18 @@ unsigned int kvm_arch_para_hints(void);
 void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
-extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+void kvm_disable_steal_time(void);
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
+
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+	if (static_branch_unlikely(&kvm_async_pf_enabled))
+		return __kvm_handle_async_pf(regs, token);
+	else
+		return false;
+}
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
@@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void)
 {
 	return;
 }
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+	return false;
+}
 #endif
 
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 96d9cd208610..6807153c0410 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -50,14 +50,12 @@ struct x86_init_resources {
  * @pre_vector_init:		init code to run before interrupt vectors
  *				are set up.
  * @intr_init:			interrupt init code
- * @trap_init:			platform specific trap setup
  * @intr_mode_select:		interrupt delivery mode selection
  * @intr_mode_init:		interrupt delivery mode setup
  */
 struct x86_init_irqs {
 	void (*pre_vector_init)(void);
 	void (*intr_init)(void);
-	void (*trap_init)(void);
 	void (*intr_mode_select)(void);
 	void (*intr_mode_init)(void);
 };
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6efe0410fb72..5ad3fcca2309 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,8 @@
 #include <asm/tlb.h>
 #include <asm/cpuidle_haltpoll.h>
 
+DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
 static int kvmapf = 1;
 
 static int __init parse_no_kvmapf(char *arg)
@@ -242,25 +244,27 @@ u32 kvm_read_and_reset_pf_reason(void)
 EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
-dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
+	/*
+	 * If we get a page fault right here, the pf_reason seems likely
+	 * to be clobbered.  Bummer.
+	 */
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code, address);
-		break;
+		return false;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
-		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
-		break;
+		kvm_async_pf_task_wait(token, !user_mode(regs));
+		return true;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)address);
+		kvm_async_pf_task_wake(token);
 		rcu_irq_exit();
-		break;
+		return true;
 	}
 }
-NOKPROBE_SYMBOL(do_async_page_fault);
+NOKPROBE_SYMBOL(__kvm_handle_async_pf);
 
 static void __init paravirt_ops_setup(void)
 {
@@ -306,7 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 static void kvm_guest_cpu_init(void)
 {
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
-		u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+		u64 pa;
+
+		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
 
 #ifdef CONFIG_PREEMPTION
 		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -592,12 +600,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 }
 #endif
 
-static void __init kvm_apf_trap_init(void)
-{
-	update_intr_gate(X86_TRAP_PF, async_page_fault);
-}
-
-
 static void kvm_flush_tlb_others(const struct cpumask *cpumask,
 			const struct flush_tlb_info *info)
 {
@@ -632,8 +634,6 @@ static void __init kvm_guest_init(void)
 	register_reboot_notifier(&kvm_pv_reboot_nb);
 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
 		raw_spin_lock_init(&async_pf_sleepers[i].lock);
-	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
-		x86_init.irqs.trap_init = kvm_apf_trap_init;
 
 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		has_steal_clock = 1;
@@ -649,6 +649,9 @@ static void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+		static_branch_enable(&kvm_async_pf_enabled);
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d54cffdc7cac..821fac47eef6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -983,7 +983,5 @@ void __init trap_init(void)
 
 	idt_setup_ist_traps();
 
-	x86_init.irqs.trap_init();
-
 	idt_setup_debugidt_traps();
 }
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 85f1a90c55cd..123f1c1f1788 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = {
 	.irqs = {
 		.pre_vector_init	= init_ISA_irqs,
 		.intr_init		= native_init_IRQ,
-		.trap_init		= x86_init_noop,
 		.intr_mode_select	= apic_intr_mode_select,
 		.intr_mode_init		= apic_intr_mode_init
 	},
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a51df516b87b..6486ccec1b0e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,6 +30,7 @@
 #include <asm/desc.h>			/* store_idt(), ...		*/
 #include <asm/cpu_entry_area.h>		/* exception stack		*/
 #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
+#include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1523,6 +1524,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 		unsigned long address)
 {
 	prefetchw(&current->mm->mmap_sem);
+	/*
+	 * KVM has two types of events that are, logically, interrupts, but
+	 * are unfortunately delivered using the #PF vector.  These events are
+	 * "you just accessed valid memory, but the host doesn't have it right
+	 * now, so I'll put you to sleep if you continue" and "that memory
+	 * you tried to access earlier is available now."
+	 *
+	 * We are relying on the interrupted context being sane (valid RSP,
+	 * relevant locks not held, etc.), which is fine as long as the
+	 * interrupted context had IF=1.  We are also relying on the KVM
+	 * async pf type field and CR2 being read consistently instead of
+	 * getting values from real and async page faults mixed up.
+	 *
+	 * Fingers crossed.
+	 */
+	if (kvm_handle_async_pf(regs, (u32)address))
+		return;
+
 	trace_page_fault_entries(regs, hw_error_code, address);
 
 	if (unlikely(kmmio_fault(regs, address)))
-- 
cgit v1.2.3


From 6bca69ada4bc20fa27eb44a5e09da3363d1752af Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 7 Mar 2020 00:42:06 +0100
Subject: x86/kvm: Sanitize kvm_async_pf_task_wait()

While working on the entry consolidation I stumbled over the KVM async page
fault handler and kvm_async_pf_task_wait() in particular. It took me a
while to realize that the randomly sprinkled around rcu_irq_enter()/exit()
invocations are just cargo cult programming. Several patches "fixed" RCU
splats by curing the symptoms without noticing that the code is flawed
from a design perspective.

The main problem is that this async injection is not based on a proper
handshake mechanism and only respects the minimal requirement, i.e. the
guest is not in a state where it has interrupts disabled.

Aside of that the actual code is a convoluted one fits it all swiss army
knife. It is invoked from different places with different RCU constraints:

  1) Host side:

     vcpu_enter_guest()
       kvm_x86_ops->handle_exit()
         kvm_handle_page_fault()
           kvm_async_pf_task_wait()

     The invocation happens from fully preemptible context.

  2) Guest side:

     The async page fault interrupted:

         a) user space

	 b) preemptible kernel code which is not in a RCU read side
	    critical section

     	 c) non-preemtible kernel code or a RCU read side critical section
	    or kernel code with CONFIG_PREEMPTION=n which allows not to
	    differentiate between #2b and #2c.

RCU is watching for:

  #1  The vCPU exited and current is definitely not the idle task

  #2a The #PF entry code on the guest went through enter_from_user_mode()
      which reactivates RCU

  #2b There is no preemptible, interrupts enabled code in the kernel
      which can run with RCU looking away. (The idle task is always
      non preemptible).

I.e. all schedulable states (#1, #2a, #2b) do not need any of this RCU
voodoo at all.

In #2c RCU is eventually not watching, but as that state cannot schedule
anyway there is no point to worry about it so it has to invoke
rcu_irq_enter() before running that code. This can be optimized, but this
will be done as an extra step in course of the entry code consolidation
work.

So the proper solution for this is to:

  - Split kvm_async_pf_task_wait() into schedule and halt based waiting
    interfaces which share the enqueueing code.

  - Add comments (condensed form of this changelog) to spare others the
    time waste and pain of reverse engineering all of this with the help of
    uncomprehensible changelogs and code history.

  - Invoke kvm_async_pf_task_wait_schedule() from kvm_handle_page_fault(),
    user mode and schedulable kernel side async page faults (#1, #2a, #2b)

  - Invoke kvm_async_pf_task_wait_halt() for the non schedulable kernel
    case (#2c).

    For this case also remove the rcu_irq_exit()/enter() pair around the
    halt as it is just a pointless exercise:

       - vCPUs can VMEXIT at any random point and can be scheduled out for
         an arbitrary amount of time by the host and this is not any
         different except that it voluntary triggers the exit via halt.

       - The interrupted context could have RCU watching already. So the
	 rcu_irq_exit() before the halt is not gaining anything aside of
	 confusing the reader. Claiming that this might prevent RCU stalls
	 is just an illusion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200505134059.262701431@linutronix.de
---
 arch/x86/include/asm/kvm_para.h |   4 +-
 arch/x86/kernel/kvm.c           | 201 ++++++++++++++++++++++++++++------------
 arch/x86/kvm/mmu/mmu.c          |   2 +-
 3 files changed, 144 insertions(+), 63 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5261363adda3..118e5c2379f9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 unsigned int kvm_arch_para_hints(void);
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
+void kvm_async_pf_task_wait_schedule(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 void kvm_disable_steal_time(void);
@@ -113,7 +113,7 @@ static inline void kvm_spinlock_init(void)
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
 
 #else /* CONFIG_KVM_GUEST */
-#define kvm_async_pf_task_wait(T, I) do {} while(0)
+#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5ad3fcca2309..c6a82f9f537f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,7 +75,7 @@ struct kvm_task_sleep_node {
 	struct swait_queue_head wq;
 	u32 token;
 	int cpu;
-	bool halted;
+	bool use_halt;
 };
 
 static struct kvm_task_sleep_head {
@@ -98,75 +98,145 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
 	return NULL;
 }
 
-/*
- * @interrupt_kernel: Is this called from a routine which interrupts the kernel
- * 		      (other than user space)?
- */
-void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
+				    struct kvm_task_sleep_node *n)
 {
 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
-	struct kvm_task_sleep_node n, *e;
-	DECLARE_SWAITQUEUE(wait);
-
-	rcu_irq_enter();
+	struct kvm_task_sleep_node *e;
 
 	raw_spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
 	if (e) {
 		/* dummy entry exist -> wake up was delivered ahead of PF */
 		hlist_del(&e->link);
-		kfree(e);
 		raw_spin_unlock(&b->lock);
+		kfree(e);
+		return false;
+	}
 
-		rcu_irq_exit();
+	n->token = token;
+	n->cpu = smp_processor_id();
+	n->use_halt = use_halt;
+	init_swait_queue_head(&n->wq);
+	hlist_add_head(&n->link, &b->list);
+	raw_spin_unlock(&b->lock);
+	return true;
+}
+
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token:	Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
+{
+	struct kvm_task_sleep_node n;
+	DECLARE_SWAITQUEUE(wait);
+
+	lockdep_assert_irqs_disabled();
+
+	if (!kvm_async_pf_queue_task(token, false, &n))
 		return;
+
+	for (;;) {
+		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+
+		local_irq_enable();
+		schedule();
+		local_irq_disable();
 	}
+	finish_swait(&n.wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
 
-	n.token = token;
-	n.cpu = smp_processor_id();
-	n.halted = is_idle_task(current) ||
-		   (IS_ENABLED(CONFIG_PREEMPT_COUNT)
-		    ? preempt_count() > 1 || rcu_preempt_depth()
-		    : interrupt_kernel);
-	init_swait_queue_head(&n.wq);
-	hlist_add_head(&n.link, &b->list);
-	raw_spin_unlock(&b->lock);
+/*
+ * Invoked from the async page fault handler.
+ */
+static void kvm_async_pf_task_wait_halt(u32 token)
+{
+	struct kvm_task_sleep_node n;
+
+	if (!kvm_async_pf_queue_task(token, true, &n))
+		return;
 
 	for (;;) {
-		if (!n.halted)
-			prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
 		if (hlist_unhashed(&n.link))
 			break;
+		/*
+		 * No point in doing anything about RCU here. Any RCU read
+		 * side critical section or RCU watching section can be
+		 * interrupted by VMEXITs and the host is free to keep the
+		 * vCPU scheduled out as long as it sees fit. This is not
+		 * any different just because of the halt induced voluntary
+		 * VMEXIT.
+		 *
+		 * Also the async page fault could have interrupted any RCU
+		 * watching context, so invoking rcu_irq_exit()/enter()
+		 * around this is not gaining anything.
+		 */
+		native_safe_halt();
+		local_irq_disable();
+	}
+}
 
-		rcu_irq_exit();
+/* Invoked from the async page fault handler */
+static void kvm_async_pf_task_wait(u32 token, bool usermode)
+{
+	bool can_schedule;
 
-		if (!n.halted) {
-			local_irq_enable();
-			schedule();
-			local_irq_disable();
-		} else {
-			/*
-			 * We cannot reschedule. So halt.
-			 */
-			native_safe_halt();
-			local_irq_disable();
-		}
+	/*
+	 * No need to check whether interrupts were disabled because the
+	 * host will (hopefully) only inject an async page fault into
+	 * interrupt enabled regions.
+	 *
+	 * If CONFIG_PREEMPTION is enabled then check whether the code
+	 * which triggered the page fault is preemptible. This covers user
+	 * mode as well because preempt_count() is obviously 0 there.
+	 *
+	 * The check for rcu_preempt_depth() is also required because
+	 * voluntary scheduling inside a rcu read locked section is not
+	 * allowed.
+	 *
+	 * The idle task is already covered by this because idle always
+	 * has a preempt count > 0.
+	 *
+	 * If CONFIG_PREEMPTION is disabled only allow scheduling when
+	 * coming from user mode as there is no indication whether the
+	 * context which triggered the page fault could schedule or not.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		can_schedule = preempt_count() + rcu_preempt_depth() == 0;
+	else
+		can_schedule = usermode;
 
+	/*
+	 * If the kernel context is allowed to schedule then RCU is
+	 * watching because no preemptible code in the kernel is inside RCU
+	 * idle state. So it can be treated like user mode. User mode is
+	 * safe because the #PF entry invoked enter_from_user_mode().
+	 *
+	 * For the non schedulable case invoke rcu_irq_enter() for
+	 * now. This will be moved out to the pagefault entry code later
+	 * and only invoked when really needed.
+	 */
+	if (can_schedule) {
+		kvm_async_pf_task_wait_schedule(token);
+	} else {
 		rcu_irq_enter();
+		kvm_async_pf_task_wait_halt(token);
+		rcu_irq_exit();
 	}
-	if (!n.halted)
-		finish_swait(&n.wq, &wait);
-
-	rcu_irq_exit();
-	return;
 }
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
 
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
 	hlist_del_init(&n->link);
-	if (n->halted)
+	if (n->use_halt)
 		smp_send_reschedule(n->cpu);
 	else if (swq_has_sleeper(&n->wq))
 		swake_up_one(&n->wq);
@@ -177,12 +247,13 @@ static void apf_task_wake_all(void)
 	int i;
 
 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
-		struct hlist_node *p, *next;
 		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+		struct kvm_task_sleep_node *n;
+		struct hlist_node *p, *next;
+
 		raw_spin_lock(&b->lock);
 		hlist_for_each_safe(p, next, &b->list) {
-			struct kvm_task_sleep_node *n =
-				hlist_entry(p, typeof(*n), link);
+			n = hlist_entry(p, typeof(*n), link);
 			if (n->cpu == smp_processor_id())
 				apf_task_wake_one(n);
 		}
@@ -223,8 +294,9 @@ again:
 		n->cpu = smp_processor_id();
 		init_swait_queue_head(&n->wq);
 		hlist_add_head(&n->link, &b->list);
-	} else
+	} else {
 		apf_task_wake_one(n);
+	}
 	raw_spin_unlock(&b->lock);
 	return;
 }
@@ -246,23 +318,33 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
-	/*
-	 * If we get a page fault right here, the pf_reason seems likely
-	 * to be clobbered.  Bummer.
-	 */
-	switch (kvm_read_and_reset_pf_reason()) {
+	u32 reason = kvm_read_and_reset_pf_reason();
+
+	switch (reason) {
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+	case KVM_PV_REASON_PAGE_READY:
+		break;
 	default:
 		return false;
-	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+	}
+
+	/*
+	 * If the host managed to inject an async #PF into an interrupt
+	 * disabled region, then die hard as this is not going to end well
+	 * and the host side is seriously broken.
+	 */
+	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+		panic("Host injected async #PF in interrupt disabled region\n");
+
+	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
 		/* page is swapped out by the host. */
-		kvm_async_pf_task_wait(token, !user_mode(regs));
-		return true;
-	case KVM_PV_REASON_PAGE_READY:
+		kvm_async_pf_task_wait(token, user_mode(regs));
+	} else {
 		rcu_irq_enter();
 		kvm_async_pf_task_wake(token);
 		rcu_irq_exit();
-		return true;
 	}
+	return true;
 }
 NOKPROBE_SYMBOL(__kvm_handle_async_pf);
 
@@ -326,12 +408,12 @@ static void kvm_guest_cpu_init(void)
 
 		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
 		__this_cpu_write(apf_reason.enabled, 1);
-		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
-		       smp_processor_id());
+		pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
 		unsigned long pa;
+
 		/* Size alignment is implied but just to make it explicit. */
 		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
 		__this_cpu_write(kvm_apic_eoi, 0);
@@ -352,8 +434,7 @@ static void kvm_pv_disable_apf(void)
 	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
 	__this_cpu_write(apf_reason.enabled, 0);
 
-	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
-	       smp_processor_id());
+	pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
 }
 
 static void kvm_pv_guest_cpu_reboot(void *unused)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..dd900a648059 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4198,7 +4198,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		vcpu->arch.apf.host_apf_reason = 0;
 		local_irq_disable();
-		kvm_async_pf_task_wait(fault_address, 0);
+		kvm_async_pf_task_wait_schedule(fault_address);
 		local_irq_enable();
 		break;
 	case KVM_PV_REASON_PAGE_READY:
-- 
cgit v1.2.3


From 3a7c8fafd1b42adea229fd204132f6a2fb3cd2d9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 24 Apr 2020 09:57:56 +0200
Subject: x86/kvm: Restrict ASYNC_PF to user space

The async page fault injection into kernel space creates more problems than
it solves. The host has absolutely no knowledge about the state of the
guest if the fault happens in CPL0. The only restriction for the host is
interrupt disabled state. If interrupts are enabled in the guest then the
exception can hit arbitrary code. The HALT based wait in non-preemotible
code is a hacky replacement for a proper hypercall.

For the ongoing work to restrict instrumentation and make the RCU idle
interaction well defined the required extra work for supporting async
pagefault in CPL0 is just not justified and creates complexity for a
dubious benefit.

The CPL3 injection is well defined and does not cause any issues as it is
more or less the same as a regular page fault from CPL3.

Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200505134059.369802541@linutronix.de
---
 arch/x86/kernel/kvm.c | 100 ++++----------------------------------------------
 1 file changed, 7 insertions(+), 93 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c6a82f9f537f..b3d9b0d7a37d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,7 +75,6 @@ struct kvm_task_sleep_node {
 	struct swait_queue_head wq;
 	u32 token;
 	int cpu;
-	bool use_halt;
 };
 
 static struct kvm_task_sleep_head {
@@ -98,8 +97,7 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
 	return NULL;
 }
 
-static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
-				    struct kvm_task_sleep_node *n)
+static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
 {
 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -117,7 +115,6 @@ static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
 
 	n->token = token;
 	n->cpu = smp_processor_id();
-	n->use_halt = use_halt;
 	init_swait_queue_head(&n->wq);
 	hlist_add_head(&n->link, &b->list);
 	raw_spin_unlock(&b->lock);
@@ -138,7 +135,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 
 	lockdep_assert_irqs_disabled();
 
-	if (!kvm_async_pf_queue_task(token, false, &n))
+	if (!kvm_async_pf_queue_task(token, &n))
 		return;
 
 	for (;;) {
@@ -154,91 +151,10 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 }
 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
 
-/*
- * Invoked from the async page fault handler.
- */
-static void kvm_async_pf_task_wait_halt(u32 token)
-{
-	struct kvm_task_sleep_node n;
-
-	if (!kvm_async_pf_queue_task(token, true, &n))
-		return;
-
-	for (;;) {
-		if (hlist_unhashed(&n.link))
-			break;
-		/*
-		 * No point in doing anything about RCU here. Any RCU read
-		 * side critical section or RCU watching section can be
-		 * interrupted by VMEXITs and the host is free to keep the
-		 * vCPU scheduled out as long as it sees fit. This is not
-		 * any different just because of the halt induced voluntary
-		 * VMEXIT.
-		 *
-		 * Also the async page fault could have interrupted any RCU
-		 * watching context, so invoking rcu_irq_exit()/enter()
-		 * around this is not gaining anything.
-		 */
-		native_safe_halt();
-		local_irq_disable();
-	}
-}
-
-/* Invoked from the async page fault handler */
-static void kvm_async_pf_task_wait(u32 token, bool usermode)
-{
-	bool can_schedule;
-
-	/*
-	 * No need to check whether interrupts were disabled because the
-	 * host will (hopefully) only inject an async page fault into
-	 * interrupt enabled regions.
-	 *
-	 * If CONFIG_PREEMPTION is enabled then check whether the code
-	 * which triggered the page fault is preemptible. This covers user
-	 * mode as well because preempt_count() is obviously 0 there.
-	 *
-	 * The check for rcu_preempt_depth() is also required because
-	 * voluntary scheduling inside a rcu read locked section is not
-	 * allowed.
-	 *
-	 * The idle task is already covered by this because idle always
-	 * has a preempt count > 0.
-	 *
-	 * If CONFIG_PREEMPTION is disabled only allow scheduling when
-	 * coming from user mode as there is no indication whether the
-	 * context which triggered the page fault could schedule or not.
-	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		can_schedule = preempt_count() + rcu_preempt_depth() == 0;
-	else
-		can_schedule = usermode;
-
-	/*
-	 * If the kernel context is allowed to schedule then RCU is
-	 * watching because no preemptible code in the kernel is inside RCU
-	 * idle state. So it can be treated like user mode. User mode is
-	 * safe because the #PF entry invoked enter_from_user_mode().
-	 *
-	 * For the non schedulable case invoke rcu_irq_enter() for
-	 * now. This will be moved out to the pagefault entry code later
-	 * and only invoked when really needed.
-	 */
-	if (can_schedule) {
-		kvm_async_pf_task_wait_schedule(token);
-	} else {
-		rcu_irq_enter();
-		kvm_async_pf_task_wait_halt(token);
-		rcu_irq_exit();
-	}
-}
-
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
 	hlist_del_init(&n->link);
-	if (n->use_halt)
-		smp_send_reschedule(n->cpu);
-	else if (swq_has_sleeper(&n->wq))
+	if (swq_has_sleeper(&n->wq))
 		swake_up_one(&n->wq);
 }
 
@@ -337,8 +253,10 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 		panic("Host injected async #PF in interrupt disabled region\n");
 
 	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
-		/* page is swapped out by the host. */
-		kvm_async_pf_task_wait(token, user_mode(regs));
+		if (unlikely(!(user_mode(regs))))
+			panic("Host injected async #PF in kernel mode\n");
+		/* Page is swapped out by the host. */
+		kvm_async_pf_task_wait_schedule(token);
 	} else {
 		rcu_irq_enter();
 		kvm_async_pf_task_wake(token);
@@ -397,10 +315,6 @@ static void kvm_guest_cpu_init(void)
 		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
 
 		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
-
-#ifdef CONFIG_PREEMPTION
-		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
-#endif
 		pa |= KVM_ASYNC_PF_ENABLED;
 
 		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
-- 
cgit v1.2.3


From 71b3ec5f221b8b3ff545639be83ddfcd5d7c9800 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Fri, 15 May 2020 16:20:56 +0100
Subject: KVM: arm64: Clean up cpu_init_hyp_mode()

Pull bits of code to the only place where it is used. Remove empty function
__cpu_init_stage2(). Remove redundant has_vhe() check since this function is
nVHE-only. No functional changes intended.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200515152056.83158-1-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_asm.h  |  2 ++
 arch/arm64/include/asm/kvm_host.h | 35 -----------------------------------
 arch/arm64/kvm/arm.c              | 32 +++++++++++++++++++++++++++-----
 3 files changed, 29 insertions(+), 40 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 59e314f38e43..0c9b5fc4ba0a 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -70,6 +70,8 @@ extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu);
 
 extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
 
+extern void __kvm_enable_ssbs(void);
+
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 extern u64 __vgic_v3_read_vmcr(void);
 extern void __vgic_v3_write_vmcr(u32 vmcr);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a723f84fab83..69a338a390a6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -533,39 +533,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
 	cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr();
 }
 
-void __kvm_enable_ssbs(void);
-
-static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
-				       unsigned long hyp_stack_ptr,
-				       unsigned long vector_ptr)
-{
-	/*
-	 * Calculate the raw per-cpu offset without a translation from the
-	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
-	 * so that we can use adr_l to access per-cpu variables in EL2.
-	 */
-	u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) -
-			 (u64)kvm_ksym_ref(kvm_host_data));
-
-	/*
-	 * Call initialization code, and switch to the full blown HYP code.
-	 * If the cpucaps haven't been finalized yet, something has gone very
-	 * wrong, and hyp will crash and burn when it uses any
-	 * cpus_have_const_cap() wrapper.
-	 */
-	BUG_ON(!system_capabilities_finalized());
-	__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
-
-	/*
-	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
-	 * at EL2.
-	 */
-	if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) &&
-	    arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
-		kvm_call_hyp(__kvm_enable_ssbs);
-	}
-}
-
 static inline bool kvm_arch_requires_vhe(void)
 {
 	/*
@@ -601,8 +568,6 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
 int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 
-static inline void __cpu_init_stage2(void) {}
-
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e01d44df98df..b0b569f2cdd0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1273,19 +1273,41 @@ static void cpu_init_hyp_mode(void)
 {
 	phys_addr_t pgd_ptr;
 	unsigned long hyp_stack_ptr;
-	unsigned long stack_page;
 	unsigned long vector_ptr;
+	unsigned long tpidr_el2;
 
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
 
+	/*
+	 * Calculate the raw per-cpu offset without a translation from the
+	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
+	 * so that we can use adr_l to access per-cpu variables in EL2.
+	 */
+	tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
+		     (unsigned long)kvm_ksym_ref(kvm_host_data));
+
 	pgd_ptr = kvm_mmu_get_httbr();
-	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
-	hyp_stack_ptr = stack_page + PAGE_SIZE;
+	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
 	vector_ptr = (unsigned long)kvm_get_hyp_vector();
 
-	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
-	__cpu_init_stage2();
+	/*
+	 * Call initialization code, and switch to the full blown HYP code.
+	 * If the cpucaps haven't been finalized yet, something has gone very
+	 * wrong, and hyp will crash and burn when it uses any
+	 * cpus_have_const_cap() wrapper.
+	 */
+	BUG_ON(!system_capabilities_finalized());
+	__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+
+	/*
+	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
+	 * at EL2.
+	 */
+	if (this_cpu_has_cap(ARM64_SSBS) &&
+	    arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+		kvm_call_hyp(__kvm_enable_ssbs);
+	}
 }
 
 static void cpu_hyp_reset(void)
-- 
cgit v1.2.3


From 438f711ce1d889632467be80779c8f5762b107d7 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Fri, 15 May 2020 16:25:50 +0100
Subject: KVM: arm64: Fix incorrect comment on kvm_get_hyp_vector()

The comment used to say that kvm_get_hyp_vector is only called on VHE systems.
In fact, it is also called from the nVHE init function cpu_init_hyp_mode().
Fix the comment to stop confusing devs.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200515152550.83810-1-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_mmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 30b0e8d6b895..796f6a2e794a 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -473,7 +473,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
 extern void *__kvm_bp_vect_base;
 extern int __kvm_harden_el2_vector_slot;
 
-/*  This is only called on a VHE system */
+/*  This is called on both VHE and !VHE systems */
 static inline void *kvm_get_hyp_vector(void)
 {
 	struct bp_hardening_data *data = arm64_get_bp_hardening_data();
-- 
cgit v1.2.3


From 0a78791c0d12fcd5d3f486668defb9ab055e3729 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 19 May 2020 11:40:36 +0100
Subject: KVM: arm64: Remove obsolete kvm_virt_to_phys abstraction

This abstraction was introduced to hide the difference between arm and
arm64 but, with the former no longer supported, this abstraction can be
removed and the canonical kernel API used directly instead.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
CC: Marc Zyngier <maz@kernel.org>
CC: James Morse <james.morse@arm.com>
CC: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20200519104036.259917-1-ascull@google.com
---
 arch/arm64/include/asm/kvm_mmu.h | 2 --
 arch/arm64/kvm/mmu.c             | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 796f6a2e794a..53bd4d517a4d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -363,8 +363,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
 	}
 }
 
-#define kvm_virt_to_phys(x)		__pa_symbol(x)
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ddf85bf21897..a1f6bc70c4e4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2197,11 +2197,11 @@ int kvm_mmu_init(void)
 {
 	int err;
 
-	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
+	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
-	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
+	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
 	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
-	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
+	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
 
 	/*
 	 * We rely on the linker script to ensure at build time that the HYP
-- 
cgit v1.2.3


From 6129ed877d409037b79866327102c9dc59a302fe Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 27 May 2020 01:49:09 -0700
Subject: KVM: x86/mmu: Set mmio_value to '0' if reserved #PF can't be
 generated

Set the mmio_value to '0' instead of simply clearing the present bit to
squash a benign warning in kvm_mmu_set_mmio_spte_mask() that complains
about the mmio_value overlapping the lower GFN mask on systems with 52
bits of PA space.

Opportunistically clean up the code and comments.

Cc: stable@vger.kernel.org
Fixes: d43e2675e96fc ("KVM: x86: only do L1TF workaround on affected processors")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200527084909.23492-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 86619631ff6a..92d056954194 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6143,25 +6143,16 @@ static void kvm_set_mmio_spte_mask(void)
 	u64 mask;
 
 	/*
-	 * Set the reserved bits and the present bit of an paging-structure
-	 * entry to generate page fault with PFER.RSV = 1.
+	 * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+	 * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
+	 * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+	 * 52-bit physical addresses then there are no reserved PA bits in the
+	 * PTEs and so the reserved PA approach must be disabled.
 	 */
-
-	/*
-	 * Mask the uppermost physical address bit, which would be reserved as
-	 * long as the supported physical address width is less than 52.
-	 */
-	mask = 1ull << 51;
-
-	/* Set the present bit. */
-	mask |= 1ull;
-
-	/*
-	 * If reserved bit is not supported, clear the present bit to disable
-	 * mmio page fault.
-	 */
-	if (shadow_phys_bits == 52)
-		mask &= ~1ull;
+	if (shadow_phys_bits < 52)
+		mask = BIT_ULL(51) | PT_PRESENT_MASK;
+	else
+		mask = 0;
 
 	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
-- 
cgit v1.2.3


From 0abcc8f65cc23b65bc8d1614cc64b02b1641ed7c Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Sat, 23 May 2020 19:14:54 +0300
Subject: KVM: VMX: enable X86_FEATURE_WAITPKG in KVM capabilities

Even though we might not allow the guest to use WAITPKG's new
instructions, we should tell KVM that the feature is supported by the
host CPU.

Note that vmx_waitpkg_supported checks that WAITPKG _can_ be set in
secondary execution controls as specified by VMX capability MSR, rather
that we actually enable it for a guest.

Cc: stable@vger.kernel.org
Fixes: e69e72faa3a0 ("KVM: x86: Add support for user wait instructions")
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20200523161455.3940-2-mlevitsk@redhat.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 89c766fad889..9b63ac8d97ee 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7138,6 +7138,9 @@ static __init void vmx_set_cpu_caps(void)
 	/* CPUID 0x80000001 */
 	if (!cpu_has_vmx_rdtscp())
 		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+
+	if (vmx_waitpkg_supported())
+		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 }
 
 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From f4cfcd2d5aea4e96c5d483c476f3057b6b7baf6a Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Sat, 23 May 2020 19:14:55 +0300
Subject: KVM: x86: don't expose MSR_IA32_UMWAIT_CONTROL unconditionally

This msr is only available when the host supports WAITPKG feature.

This breaks a nested guest, if the L1 hypervisor is set to ignore
unknown msrs, because the only other safety check that the
kernel does is that it attempts to read the msr and
rejects it if it gets an exception.

Cc: stable@vger.kernel.org
Fixes: 6e3ba4abce ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL")
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20200523161455.3940-3-mlevitsk@redhat.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c17e6eb9ad43..e0083a08da9e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5242,6 +5242,10 @@ static void kvm_init_msr_list(void)
 			if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
 				continue;
 			break;
+		case MSR_IA32_UMWAIT_CONTROL:
+			if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+				continue;
+			break;
 		case MSR_IA32_RTIT_CTL:
 		case MSR_IA32_RTIT_STATUS:
 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
-- 
cgit v1.2.3


From e7581caca4c105d81a490a3e15cf46d6407e3fa7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2020 05:04:49 -0400
Subject: KVM: x86: simplify is_mmio_spte

We can simply look at bits 52-53 to identify MMIO entries in KVM's page
tables.  Therefore, there is no need to pass a mask to kvm_mmu_set_mmio_spte_mask.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.h     |  2 +-
 arch/x86/kvm/mmu/mmu.c | 10 +++-------
 arch/x86/kvm/svm/svm.c |  2 +-
 arch/x86/kvm/vmx/vmx.c |  3 +--
 4 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8a3b1bce722a..048e865ad485 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
 	return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 92d056954194..bef05a437a89 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -244,7 +244,6 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_mmio_access_mask;
 static u64 __read_mostly shadow_present_mask;
@@ -331,21 +330,19 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 	kvm_flush_remote_tlbs_with_range(kvm, &range);
 }
 
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
 {
 	BUG_ON((u64)(unsigned)access_mask != access_mask);
-	BUG_ON((mmio_mask & mmio_value) != mmio_value);
 	WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
 	WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
 	shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
-	shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 	shadow_mmio_access_mask = access_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
 static bool is_mmio_spte(u64 spte)
 {
-	return (spte & shadow_mmio_mask) == shadow_mmio_value;
+	return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
 }
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
@@ -568,7 +565,6 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	shadow_dirty_mask = 0;
 	shadow_nx_mask = 0;
 	shadow_x_mask = 0;
-	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
 
@@ -6154,7 +6150,7 @@ static void kvm_set_mmio_spte_mask(void)
 	else
 		mask = 0;
 
-	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+	kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
 static bool get_nx_auto_mode(void)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a862c768fd54..3c9b321a420a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -780,7 +780,7 @@ static __init void svm_adjust_mmio_mask(void)
 	 */
 	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 
-	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+	kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
 static void svm_hardware_teardown(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9b63ac8d97ee..65ff16f00c08 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4147,8 +4147,7 @@ static void ept_set_mmio_spte_mask(void)
 	 * EPT Misconfigurations can be generated if the value of bits 2:0
 	 * of an EPT paging-structure entry is 110b (write/execute).
 	 */
-	kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
-				   VMX_EPT_MISCONFIG_WX_VALUE, 0);
+	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
-- 
cgit v1.2.3


From df2a69af85bef169ab6810cc57f6b6b943941e7e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2020 12:51:32 -0400
Subject: KVM: x86: allow KVM_STATE_NESTED_MTF_PENDING in kvm_state flags

The migration functionality was left incomplete in commit 5ef8acbdd687
("KVM: nVMX: Emulate MTF when performing instruction emulation", 2020-02-23),
fix it.

Fixes: 5ef8acbdd687 ("KVM: nVMX: Emulate MTF when performing instruction emulation")
Cc: stable@vger.kernel.org
Reviewed-by: Oliver Upton <oupton@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8ac69c16f153..0621a63afee7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4626,7 +4626,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
 		if (kvm_state.flags &
 		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
-		      | KVM_STATE_NESTED_EVMCS))
+		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING))
 			break;
 
 		/* nested_run_pending implies guest_mode.  */
-- 
cgit v1.2.3


From 7cb85fc465fdb91e88eff02b496150b1270952cf Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 18 Feb 2020 15:40:10 -0800
Subject: KVM: x86: Remove superfluous brackets from case statement

Remove unnecessary brackets from a case statement that unintentionally
encapsulates unrelated case statements in the same switch statement.
While technically legal and functionally correct syntax, the brackets
are visually confusing and potentially dangerous, e.g. the last of the
encapsulated case statements has an undocumented fall-through that isn't
flagged by compilers due the encapsulation.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200218234012.7110-2-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0621a63afee7..43f9b63399f0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5303,7 +5303,7 @@ static void kvm_init_msr_list(void)
 				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
 				continue;
 			break;
-		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
+		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
 				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
@@ -5318,7 +5318,7 @@ static void kvm_init_msr_list(void)
 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
 			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
 				continue;
-		}
+			break;
 		default:
 			break;
 		}
-- 
cgit v1.2.3


From cb97c2d680dd9cefd2b36cafd2426f52d85b27e7 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 18 Feb 2020 15:40:11 -0800
Subject: KVM: x86: Take an unsigned 32-bit int for has_emulated_msr()'s index

Take a u32 for the index in has_emulated_msr() to match hardware, which
treats MSR indices as unsigned 32-bit values.  Functionally, taking a
signed int doesn't cause problems with the current code base, but could
theoretically cause problems with 32-bit KVM, e.g. if the index were
checked via a less-than statement, which would evaluate incorrectly for
MSR indices with bit 31 set.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200218234012.7110-3-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/svm/svm.c          | 2 +-
 arch/x86/kvm/vmx/vmx.c          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fd78bd44b2d6..db261da578f3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1064,7 +1064,7 @@ struct kvm_x86_ops {
 	void (*hardware_disable)(void);
 	void (*hardware_unsetup)(void);
 	bool (*cpu_has_accelerated_tpr)(void);
-	bool (*has_emulated_msr)(int index);
+	bool (*has_emulated_msr)(u32 index);
 	void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
 	unsigned int vm_size;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e9c0fb68387d..d877a0f70cac 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3525,7 +3525,7 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
-static bool svm_has_emulated_msr(int index)
+static bool svm_has_emulated_msr(u32 index)
 {
 	switch (index) {
 	case MSR_IA32_MCG_EXT_CTL:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0ea5a225a579..ab31033121f9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6434,7 +6434,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 		handle_exception_nmi_irqoff(vmx);
 }
 
-static bool vmx_has_emulated_msr(int index)
+static bool vmx_has_emulated_msr(u32 index)
 {
 	switch (index) {
 	case MSR_IA32_SMBASE:
-- 
cgit v1.2.3


From a8cfbae59284ef20f38936a135f82a4d8d9105d2 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 19 Feb 2020 10:45:48 +0800
Subject: KVM: VMX: replace "fall through" with "return" to indicate different
 case

The second "/* fall through */" in rmode_exception() makes code harder to
read. Replace it with "return" to indicate they are different cases, only
the #DB and #BP check vcpu->guest_debug, while others don't care. And this
also improves the readability.

Suggested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Message-Id: <1582080348-20827-1-git-send-email-linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ab31033121f9..4e76e30b661c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4626,10 +4626,8 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
 			return false;
 		/* fall through */
 	case DB_VECTOR:
-		if (vcpu->guest_debug &
-			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-			return false;
-		/* fall through */
+		return !(vcpu->guest_debug &
+			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
 	case DE_VECTOR:
 	case OF_VECTOR:
 	case BR_VECTOR:
-- 
cgit v1.2.3


From 80bc97f2d84dbec6df6b36d6bbc0715c14f80c61 Mon Sep 17 00:00:00 2001
From: Haiwei Li <lihaiwei@tencent.com>
Date: Mon, 18 May 2020 09:31:38 +0800
Subject: KVM: Fix the indentation to match coding style

There is a bad indentation in next&queue branch. The patch looks like
fixes nothing though it fixes the indentation.

Before fixing:

                 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
                         kvm_skip_emulated_instruction(vcpu);
                         ret = EXIT_FASTPATH_EXIT_HANDLED;
                }
                 break;
         case MSR_IA32_TSCDEADLINE:

After fixing:

                 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
                         kvm_skip_emulated_instruction(vcpu);
                         ret = EXIT_FASTPATH_EXIT_HANDLED;
                 }
                 break;
         case MSR_IA32_TSCDEADLINE:

Signed-off-by: Haiwei Li <lihaiwei@tencent.com>
Message-Id: <2f78457e-f3a7-3bc9-e237-3132ee87f71e@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43f9b63399f0..375d3fc0a4a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1631,7 +1631,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
 		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
 			kvm_skip_emulated_instruction(vcpu);
 			ret = EXIT_FASTPATH_EXIT_HANDLED;
-               }
+		}
 		break;
 	case MSR_IA32_TSCDEADLINE:
 		data = kvm_read_edx_eax(vcpu);
-- 
cgit v1.2.3


From 88197e6ab33a909e54f683b5e3a2e45cc307108a Mon Sep 17 00:00:00 2001
From: "彭浩(Richard)" <richard.peng@oppo.com>
Date: Thu, 21 May 2020 05:57:49 +0000
Subject: kvm/x86: Remove redundant function implementations

pic_in_kernel(), ioapic_in_kernel() and irqchip_kernel() have the
same implementation.

Signed-off-by: Peng Hao <richard.peng@oppo.com>
Message-Id: <HKAPR02MB4291D5926EA10B8BFE9EA0D3E0B70@HKAPR02MB4291.apcprd02.prod.outlook.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/ioapic.h  |  8 ++------
 arch/x86/kvm/irq.h     | 15 +++++----------
 arch/x86/kvm/lapic.c   |  1 +
 arch/x86/kvm/mmu/mmu.c |  1 +
 arch/x86/kvm/x86.c     |  1 +
 5 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 2fb2e3c80724..660401700075 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -3,8 +3,8 @@
 #define __KVM_IO_APIC_H
 
 #include <linux/kvm_host.h>
-
 #include <kvm/iodev.h>
+#include "irq.h"
 
 struct kvm;
 struct kvm_vcpu;
@@ -108,11 +108,7 @@ do {									\
 
 static inline int ioapic_in_kernel(struct kvm *kvm)
 {
-	int mode = kvm->arch.irqchip_mode;
-
-	/* Matches smp_wmb() when setting irqchip_mode */
-	smp_rmb();
-	return mode == KVM_IRQCHIP_KERNEL;
+	return irqchip_kernel(kvm);
 }
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f173ab6b407e..9b64abf9b3f1 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -16,7 +16,6 @@
 #include <linux/spinlock.h>
 
 #include <kvm/iodev.h>
-#include "ioapic.h"
 #include "lapic.h"
 
 #define PIC_NUM_PINS 16
@@ -66,15 +65,6 @@ void kvm_pic_destroy(struct kvm *kvm);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
-static inline int pic_in_kernel(struct kvm *kvm)
-{
-	int mode = kvm->arch.irqchip_mode;
-
-	/* Matches smp_wmb() when setting irqchip_mode */
-	smp_rmb();
-	return mode == KVM_IRQCHIP_KERNEL;
-}
-
 static inline int irqchip_split(struct kvm *kvm)
 {
 	int mode = kvm->arch.irqchip_mode;
@@ -93,6 +83,11 @@ static inline int irqchip_kernel(struct kvm *kvm)
 	return mode == KVM_IRQCHIP_KERNEL;
 }
 
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+	return irqchip_kernel(kvm);
+}
+
 static inline int irqchip_in_kernel(struct kvm *kvm)
 {
 	int mode = kvm->arch.irqchip_mode;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2a3b57401a68..34a7e0533dad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -36,6 +36,7 @@
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
+#include "ioapic.h"
 #include "trace.h"
 #include "x86.h"
 #include "cpuid.h"
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f63258183b8f..fd1c9145505c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -16,6 +16,7 @@
  */
 
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 375d3fc0a4a6..6d13e9d1bb75 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -18,6 +18,7 @@
 
 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
-- 
cgit v1.2.3


From a3535be731c2a343912578465021f50937f7b099 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sat, 16 May 2020 09:19:06 -0400
Subject: KVM: nSVM: fix condition for filtering async PF

Async page faults have to be trapped in the host (L1 in this case),
since the APF reason was passed from L0 to L1 and stored in the L1 APF
data page.  This was completely reversed: the page faults were passed
to the guest, a L2 hypervisor.

Cc: stable@vger.kernel.org
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a89a166d1cb8..f4cd2d0cc360 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -880,8 +880,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 			return NESTED_EXIT_HOST;
 		break;
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-		/* When we're shadowing, trap PFs, but not async PF */
-		if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+		/* Trap async PF even if not shadowing */
+		if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
 			return NESTED_EXIT_HOST;
 		break;
 	default:
-- 
cgit v1.2.3


From 6c0238c4a62b3a0b1201aeb7e33a4636d552a436 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 May 2020 08:02:17 -0400
Subject: KVM: nSVM: leave ASID aside in copy_vmcb_control_area

Restoring the ASID from the hsave area on VMEXIT is wrong, because its
value depends on the handling of TLB flushes.  Just skipping the field in
copy_vmcb_control_area will do.

Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index f4cd2d0cc360..d544cce4f964 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -150,7 +150,7 @@ static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb
 	dst->iopm_base_pa         = from->iopm_base_pa;
 	dst->msrpm_base_pa        = from->msrpm_base_pa;
 	dst->tsc_offset           = from->tsc_offset;
-	dst->asid                 = from->asid;
+	/* asid not copied, it is handled manually for svm->vmcb.  */
 	dst->tlb_ctl              = from->tlb_ctl;
 	dst->int_ctl              = from->int_ctl;
 	dst->int_vector           = from->int_vector;
-- 
cgit v1.2.3


From 7d2e8748af62b0de7c7bbcb0d62f937e88fd7027 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Wed, 27 May 2020 01:54:00 -0700
Subject: KVM: x86: Initialize tdp_level during vCPU creation

Initialize vcpu->arch.tdp_level during vCPU creation to avoid consuming
garbage if userspace calls KVM_RUN without first calling KVM_SET_CPUID.

Fixes: e93fd3b3e89e9 ("KVM: x86/mmu: Capture TDP level when updating CPUID")
Reported-by: syzbot+904752567107eefb728c@syzkaller.appspotmail.com
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Message-Id: <20200527085400.23759-1-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6d13e9d1bb75..329bdd2eb2cf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9418,6 +9418,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	fx_init(vcpu);
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+	vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
 
 	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 
-- 
cgit v1.2.3


From b6162e82aef19fee9c32cb3fe9ac30d9116a8c73 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 27 May 2020 11:01:02 +0200
Subject: KVM: nSVM: Preserve registers modifications done before
 nested_svm_vmexit()

L2 guest hang is observed after 'exit_required' was dropped and nSVM
switched to check_nested_events() completely. The hang is a busy loop when
e.g. KVM is emulating an instruction (e.g. L2 is accessing MMIO space and
we drop to userspace). After nested_svm_vmexit() and when L1 is doing VMRUN
nested guest's RIP is not advanced so KVM goes into emulating the same
instruction which caused nested_svm_vmexit() and the loop continues.

nested_svm_vmexit() is not new, however, with check_nested_events() we're
now calling it later than before. In case by that time KVM has modified
register state we may pick stale values from VMCB when trying to save
nested guest state to nested VMCB.

nVMX code handles this case correctly: sync_vmcs02_to_vmcs12() called from
nested_vmx_vmexit() does e.g 'vmcs12->guest_rip = kvm_rip_read(vcpu)' and
this ensures KVM-made modifications are preserved. Do the same for nSVM.

Generally, nested_vmx_vmexit()/nested_svm_vmexit() need to pick up all
nested guest state modifications done by KVM after vmexit. It would be
great to find a way to express this in a way which would not require to
manually track these changes, e.g. nested_{vmcb,vmcs}_get_field().

Co-debugged-with: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200527090102.220647-1-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index d544cce4f964..f39e0d578b9b 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -491,9 +491,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
 	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
-	nested_vmcb->save.rip    = vmcb->save.rip;
-	nested_vmcb->save.rsp    = vmcb->save.rsp;
-	nested_vmcb->save.rax    = vmcb->save.rax;
+	nested_vmcb->save.rip    = kvm_rip_read(&svm->vcpu);
+	nested_vmcb->save.rsp    = kvm_rsp_read(&svm->vcpu);
+	nested_vmcb->save.rax    = kvm_rax_read(&svm->vcpu);
 	nested_vmcb->save.dr7    = vmcb->save.dr7;
 	nested_vmcb->save.dr6    = svm->vcpu.arch.dr6;
 	nested_vmcb->save.cpl    = vmcb->save.cpl;
-- 
cgit v1.2.3


From c6b22f59d694d0caf61aefb262d9639b3d9661d5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 26 May 2020 09:05:27 -0400
Subject: KVM: x86: track manually whether an event has been injected

Instead of calling kvm_event_needs_reinjection, track its
future return value in a variable.  This will be useful in
the next patch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 329bdd2eb2cf..77b9b4e66673 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7717,11 +7717,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 static int inject_pending_event(struct kvm_vcpu *vcpu)
 {
 	int r;
+	bool can_inject = true;
 
 	/* try to reinject previous events if any */
 
-	if (vcpu->arch.exception.injected)
+	if (vcpu->arch.exception.injected) {
 		kvm_x86_ops.queue_exception(vcpu);
+		can_inject = false;
+	}
 	/*
 	 * Do not inject an NMI or interrupt if there is a pending
 	 * exception.  Exceptions and interrupts are recognized at
@@ -7737,10 +7740,13 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	 * fully complete the previous instruction.
 	 */
 	else if (!vcpu->arch.exception.pending) {
-		if (vcpu->arch.nmi_injected)
+		if (vcpu->arch.nmi_injected) {
 			kvm_x86_ops.set_nmi(vcpu);
-		else if (vcpu->arch.interrupt.injected)
+			can_inject = false;
+		} else if (vcpu->arch.interrupt.injected) {
 			kvm_x86_ops.set_irq(vcpu);
+			can_inject = false;
+		}
 	}
 
 	WARN_ON_ONCE(vcpu->arch.exception.injected &&
@@ -7790,10 +7796,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 		}
 
 		kvm_x86_ops.queue_exception(vcpu);
+		can_inject = false;
 	}
 
-	/* Don't consider new event if we re-injected an event */
-	if (kvm_event_needs_reinjection(vcpu))
+	/* Finish re-injection before considering new events */
+	if (!can_inject)
 		return 0;
 
 	if (vcpu->arch.smi_pending &&
-- 
cgit v1.2.3


From fc5d1f1a42fba6266ab95dc3b84937933a9b5a66 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@arm.com>
Date: Sat, 1 Dec 2018 08:41:28 -0800
Subject: KVM: arm64: vgic-v3: Take cpu_if pointer directly instead of vcpu

If we move the used_lrs field to the version-specific cpu interface
structure, the following functions only operate on the struct
vgic_v3_cpu_if and not the full vcpu:

  __vgic_v3_save_state
  __vgic_v3_restore_state
  __vgic_v3_activate_traps
  __vgic_v3_deactivate_traps
  __vgic_v3_save_aprs
  __vgic_v3_restore_aprs

This is going to be very useful for nested virt, so move the used_lrs
field and change the prototypes and implementations of these functions to
take the cpu_if parameter directly.

No functional change.

Reviewed-by: James Morse <james.morse@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_hyp.h | 12 ++++++------
 arch/arm64/kvm/hyp/switch.c      |  8 ++++----
 arch/arm64/kvm/hyp/vgic-v3-sr.c  | 33 ++++++++++-----------------------
 arch/arm64/kvm/vgic/vgic-v2.c    | 10 +++++-----
 arch/arm64/kvm/vgic/vgic-v3.c    | 14 ++++++++------
 arch/arm64/kvm/vgic/vgic.c       | 25 +++++++++++++++++--------
 include/kvm/arm_vgic.h           |  5 ++++-
 7 files changed, 54 insertions(+), 53 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index fe57f60f06a8..4f67b0cdffe8 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -56,12 +56,12 @@
 
 int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
-void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
-void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
-void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
-void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
-void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
-void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
+void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 void __timer_enable_traps(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 8a1e81a400e0..c07a45643cd4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -270,8 +270,8 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
 static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
 {
 	if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
-		__vgic_v3_save_state(vcpu);
-		__vgic_v3_deactivate_traps(vcpu);
+		__vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
+		__vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
 	}
 }
 
@@ -279,8 +279,8 @@ static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
 static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
 {
 	if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
-		__vgic_v3_activate_traps(vcpu);
-		__vgic_v3_restore_state(vcpu);
+		__vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
+		__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
 	}
 }
 
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 6b85773e15c4..10ed539835c1 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -194,10 +194,9 @@ static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
 	return val;
 }
 
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 used_lrs = cpu_if->used_lrs;
 
 	/*
 	 * Make sure stores to the GIC via the memory mapped interface
@@ -230,10 +229,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 	}
 }
 
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 used_lrs = cpu_if->used_lrs;
 	int i;
 
 	if (used_lrs || cpu_if->its_vpe.its_vm) {
@@ -257,10 +255,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	}
 }
 
-void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
 	/*
 	 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
 	 * Group0 interrupt (as generated in GICv2 mode) to be
@@ -306,9 +302,8 @@ void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
 		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
 }
 
-void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
 	u64 val;
 
 	if (!cpu_if->vgic_sre) {
@@ -333,15 +328,11 @@ void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
 		write_gicreg(0, ICH_HCR_EL2);
 }
 
-void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if;
 	u64 val;
 	u32 nr_pre_bits;
 
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
 	val = read_gicreg(ICH_VTR_EL2);
 	nr_pre_bits = vtr_to_nr_pre_bits(val);
 
@@ -370,15 +361,11 @@ void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
 	}
 }
 
-void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
+void __hyp_text __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
 {
-	struct vgic_v3_cpu_if *cpu_if;
 	u64 val;
 	u32 nr_pre_bits;
 
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
 	val = read_gicreg(ICH_VTR_EL2);
 	nr_pre_bits = vtr_to_nr_pre_bits(val);
 
@@ -451,7 +438,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
 						    u32 vmcr,
 						    u64 *lr_val)
 {
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
 	u8 priority = GICv3_IDLE_PRIORITY;
 	int i, lr = -1;
 
@@ -490,7 +477,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
 static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
 					       int intid, u64 *lr_val)
 {
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
 	int i;
 
 	for (i = 0; i < used_lrs; i++) {
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 621cc168fe3f..ebf53a4e1296 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -56,7 +56,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 
 	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
 
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+	for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) {
 		u32 val = cpuif->vgic_lr[lr];
 		u32 cpuid, intid = val & GICH_LR_VIRTUALID;
 		struct vgic_irq *irq;
@@ -120,7 +120,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 
-	vgic_cpu->used_lrs = 0;
+	cpuif->used_lrs = 0;
 }
 
 /*
@@ -427,7 +427,7 @@ out:
 static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 used_lrs = cpu_if->used_lrs;
 	u64 elrsr;
 	int i;
 
@@ -448,7 +448,7 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
 void vgic_v2_save_state(struct kvm_vcpu *vcpu)
 {
 	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
 
 	if (!base)
 		return;
@@ -463,7 +463,7 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
 	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
+	u64 used_lrs = cpu_if->used_lrs;
 	int i;
 
 	if (!base)
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 3ccd6d3cb4d3..76e2d85789ed 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -39,7 +39,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 
 	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
 
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
+	for (lr = 0; lr < cpuif->used_lrs; lr++) {
 		u64 val = cpuif->vgic_lr[lr];
 		u32 intid, cpuid;
 		struct vgic_irq *irq;
@@ -111,7 +111,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 
-	vgic_cpu->used_lrs = 0;
+	cpuif->used_lrs = 0;
 }
 
 /* Requires the irq to be locked already */
@@ -662,10 +662,10 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
 	if (likely(cpu_if->vgic_sre))
 		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
 
-	kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
+	kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
 
 	if (has_vhe())
-		__vgic_v3_activate_traps(vcpu);
+		__vgic_v3_activate_traps(cpu_if);
 
 	WARN_ON(vgic_v4_load(vcpu));
 }
@@ -680,12 +680,14 @@ void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
 
 void vgic_v3_put(struct kvm_vcpu *vcpu)
 {
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
 	WARN_ON(vgic_v4_put(vcpu, false));
 
 	vgic_v3_vmcr_sync(vcpu);
 
-	kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
+	kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
 
 	if (has_vhe())
-		__vgic_v3_deactivate_traps(vcpu);
+		__vgic_v3_deactivate_traps(cpu_if);
 }
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 99b02ca730a8..c3643b7f101b 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -786,6 +786,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
 	int count;
 	bool multi_sgi;
 	u8 prio = 0xff;
+	int i = 0;
 
 	lockdep_assert_held(&vgic_cpu->ap_list_lock);
 
@@ -827,11 +828,14 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	vcpu->arch.vgic_cpu.used_lrs = count;
-
 	/* Nuke remaining LRs */
-	for ( ; count < kvm_vgic_global_state.nr_lr; count++)
-		vgic_clear_lr(vcpu, count);
+	for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
+		vgic_clear_lr(vcpu, i);
+
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
+	else
+		vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
 }
 
 static inline bool can_access_vgic_from_kernel(void)
@@ -849,13 +853,13 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu)
 	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
 		vgic_v2_save_state(vcpu);
 	else
-		__vgic_v3_save_state(vcpu);
+		__vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
 }
 
 /* Sync back the hardware VGIC state into our emulation after a guest's run. */
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	int used_lrs;
 
 	/* An empty ap_list_head implies used_lrs == 0 */
 	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
@@ -864,7 +868,12 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	if (can_access_vgic_from_kernel())
 		vgic_save_state(vcpu);
 
-	if (vgic_cpu->used_lrs)
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
+	else
+		used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+
+	if (used_lrs)
 		vgic_fold_lr_state(vcpu);
 	vgic_prune_ap_list(vcpu);
 }
@@ -874,7 +883,7 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
 	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
 		vgic_v2_restore_state(vcpu);
 	else
-		__vgic_v3_restore_state(vcpu);
+		__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
 }
 
 /* Flush our emulation state into the GIC hardware before entering the guest. */
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 69f4164d6477..a8d8fdcd3723 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -274,6 +274,8 @@ struct vgic_v2_cpu_if {
 	u32		vgic_vmcr;
 	u32		vgic_apr;
 	u32		vgic_lr[VGIC_V2_MAX_LRS];
+
+	unsigned int used_lrs;
 };
 
 struct vgic_v3_cpu_if {
@@ -291,6 +293,8 @@ struct vgic_v3_cpu_if {
 	 * linking the Linux IRQ subsystem and the ITS together.
 	 */
 	struct its_vpe	its_vpe;
+
+	unsigned int used_lrs;
 };
 
 struct vgic_cpu {
@@ -300,7 +304,6 @@ struct vgic_cpu {
 		struct vgic_v3_cpu_if	vgic_v3;
 	};
 
-	unsigned int used_lrs;
 	struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
 
 	raw_spinlock_t ap_list_lock;	/* Protects the ap_list */
-- 
cgit v1.2.3


From 7ea90bdd70c9cf82dfbaa54e7d9f296928679224 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 20 Jun 2019 11:17:00 +0100
Subject: KVM: arm64: Refactor vcpu_{read,write}_sys_reg

Extract the direct HW accessors for later reuse.

Reviewed-by: James Morse <james.morse@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/sys_regs.c | 128 +++++++++++++++++++++++++---------------------
 1 file changed, 71 insertions(+), 57 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 620eaf11e672..50e328ca1419 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -64,11 +64,8 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
 	return false;
 }
 
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
 {
-	if (!vcpu->arch.sysregs_loaded_on_cpu)
-		goto immediate_read;
-
 	/*
 	 * System registers listed in the switch are not saved on every
 	 * exit from the guest but are only saved on vcpu_put.
@@ -79,40 +76,37 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
 	 * thread when emulating cross-VCPU communication.
 	 */
 	switch (reg) {
-	case CSSELR_EL1:	return read_sysreg_s(SYS_CSSELR_EL1);
-	case SCTLR_EL1:		return read_sysreg_s(SYS_SCTLR_EL12);
-	case ACTLR_EL1:		return read_sysreg_s(SYS_ACTLR_EL1);
-	case CPACR_EL1:		return read_sysreg_s(SYS_CPACR_EL12);
-	case TTBR0_EL1:		return read_sysreg_s(SYS_TTBR0_EL12);
-	case TTBR1_EL1:		return read_sysreg_s(SYS_TTBR1_EL12);
-	case TCR_EL1:		return read_sysreg_s(SYS_TCR_EL12);
-	case ESR_EL1:		return read_sysreg_s(SYS_ESR_EL12);
-	case AFSR0_EL1:		return read_sysreg_s(SYS_AFSR0_EL12);
-	case AFSR1_EL1:		return read_sysreg_s(SYS_AFSR1_EL12);
-	case FAR_EL1:		return read_sysreg_s(SYS_FAR_EL12);
-	case MAIR_EL1:		return read_sysreg_s(SYS_MAIR_EL12);
-	case VBAR_EL1:		return read_sysreg_s(SYS_VBAR_EL12);
-	case CONTEXTIDR_EL1:	return read_sysreg_s(SYS_CONTEXTIDR_EL12);
-	case TPIDR_EL0:		return read_sysreg_s(SYS_TPIDR_EL0);
-	case TPIDRRO_EL0:	return read_sysreg_s(SYS_TPIDRRO_EL0);
-	case TPIDR_EL1:		return read_sysreg_s(SYS_TPIDR_EL1);
-	case AMAIR_EL1:		return read_sysreg_s(SYS_AMAIR_EL12);
-	case CNTKCTL_EL1:	return read_sysreg_s(SYS_CNTKCTL_EL12);
-	case PAR_EL1:		return read_sysreg_s(SYS_PAR_EL1);
-	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
-	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
-	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
+	case CSSELR_EL1:	*val = read_sysreg_s(SYS_CSSELR_EL1);	break;
+	case SCTLR_EL1:		*val = read_sysreg_s(SYS_SCTLR_EL12);	break;
+	case ACTLR_EL1:		*val = read_sysreg_s(SYS_ACTLR_EL1);	break;
+	case CPACR_EL1:		*val = read_sysreg_s(SYS_CPACR_EL12);	break;
+	case TTBR0_EL1:		*val = read_sysreg_s(SYS_TTBR0_EL12);	break;
+	case TTBR1_EL1:		*val = read_sysreg_s(SYS_TTBR1_EL12);	break;
+	case TCR_EL1:		*val = read_sysreg_s(SYS_TCR_EL12);	break;
+	case ESR_EL1:		*val = read_sysreg_s(SYS_ESR_EL12);	break;
+	case AFSR0_EL1:		*val = read_sysreg_s(SYS_AFSR0_EL12);	break;
+	case AFSR1_EL1:		*val = read_sysreg_s(SYS_AFSR1_EL12);	break;
+	case FAR_EL1:		*val = read_sysreg_s(SYS_FAR_EL12);	break;
+	case MAIR_EL1:		*val = read_sysreg_s(SYS_MAIR_EL12);	break;
+	case VBAR_EL1:		*val = read_sysreg_s(SYS_VBAR_EL12);	break;
+	case CONTEXTIDR_EL1:	*val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+	case TPIDR_EL0:		*val = read_sysreg_s(SYS_TPIDR_EL0);	break;
+	case TPIDRRO_EL0:	*val = read_sysreg_s(SYS_TPIDRRO_EL0);	break;
+	case TPIDR_EL1:		*val = read_sysreg_s(SYS_TPIDR_EL1);	break;
+	case AMAIR_EL1:		*val = read_sysreg_s(SYS_AMAIR_EL12);	break;
+	case CNTKCTL_EL1:	*val = read_sysreg_s(SYS_CNTKCTL_EL12);	break;
+	case PAR_EL1:		*val = read_sysreg_s(SYS_PAR_EL1);	break;
+	case DACR32_EL2:	*val = read_sysreg_s(SYS_DACR32_EL2);	break;
+	case IFSR32_EL2:	*val = read_sysreg_s(SYS_IFSR32_EL2);	break;
+	case DBGVCR32_EL2:	*val = read_sysreg_s(SYS_DBGVCR32_EL2);	break;
+	default:		return false;
 	}
 
-immediate_read:
-	return __vcpu_sys_reg(vcpu, reg);
+	return true;
 }
 
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
 {
-	if (!vcpu->arch.sysregs_loaded_on_cpu)
-		goto immediate_write;
-
 	/*
 	 * System registers listed in the switch are not restored on every
 	 * entry to the guest but are only restored on vcpu_load.
@@ -122,32 +116,52 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 	 * once, before running the VCPU, and never changed later.
 	 */
 	switch (reg) {
-	case CSSELR_EL1:	write_sysreg_s(val, SYS_CSSELR_EL1);	return;
-	case SCTLR_EL1:		write_sysreg_s(val, SYS_SCTLR_EL12);	return;
-	case ACTLR_EL1:		write_sysreg_s(val, SYS_ACTLR_EL1);	return;
-	case CPACR_EL1:		write_sysreg_s(val, SYS_CPACR_EL12);	return;
-	case TTBR0_EL1:		write_sysreg_s(val, SYS_TTBR0_EL12);	return;
-	case TTBR1_EL1:		write_sysreg_s(val, SYS_TTBR1_EL12);	return;
-	case TCR_EL1:		write_sysreg_s(val, SYS_TCR_EL12);	return;
-	case ESR_EL1:		write_sysreg_s(val, SYS_ESR_EL12);	return;
-	case AFSR0_EL1:		write_sysreg_s(val, SYS_AFSR0_EL12);	return;
-	case AFSR1_EL1:		write_sysreg_s(val, SYS_AFSR1_EL12);	return;
-	case FAR_EL1:		write_sysreg_s(val, SYS_FAR_EL12);	return;
-	case MAIR_EL1:		write_sysreg_s(val, SYS_MAIR_EL12);	return;
-	case VBAR_EL1:		write_sysreg_s(val, SYS_VBAR_EL12);	return;
-	case CONTEXTIDR_EL1:	write_sysreg_s(val, SYS_CONTEXTIDR_EL12); return;
-	case TPIDR_EL0:		write_sysreg_s(val, SYS_TPIDR_EL0);	return;
-	case TPIDRRO_EL0:	write_sysreg_s(val, SYS_TPIDRRO_EL0);	return;
-	case TPIDR_EL1:		write_sysreg_s(val, SYS_TPIDR_EL1);	return;
-	case AMAIR_EL1:		write_sysreg_s(val, SYS_AMAIR_EL12);	return;
-	case CNTKCTL_EL1:	write_sysreg_s(val, SYS_CNTKCTL_EL12);	return;
-	case PAR_EL1:		write_sysreg_s(val, SYS_PAR_EL1);	return;
-	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	return;
-	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	return;
-	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	return;
+	case CSSELR_EL1:	write_sysreg_s(val, SYS_CSSELR_EL1);	break;
+	case SCTLR_EL1:		write_sysreg_s(val, SYS_SCTLR_EL12);	break;
+	case ACTLR_EL1:		write_sysreg_s(val, SYS_ACTLR_EL1);	break;
+	case CPACR_EL1:		write_sysreg_s(val, SYS_CPACR_EL12);	break;
+	case TTBR0_EL1:		write_sysreg_s(val, SYS_TTBR0_EL12);	break;
+	case TTBR1_EL1:		write_sysreg_s(val, SYS_TTBR1_EL12);	break;
+	case TCR_EL1:		write_sysreg_s(val, SYS_TCR_EL12);	break;
+	case ESR_EL1:		write_sysreg_s(val, SYS_ESR_EL12);	break;
+	case AFSR0_EL1:		write_sysreg_s(val, SYS_AFSR0_EL12);	break;
+	case AFSR1_EL1:		write_sysreg_s(val, SYS_AFSR1_EL12);	break;
+	case FAR_EL1:		write_sysreg_s(val, SYS_FAR_EL12);	break;
+	case MAIR_EL1:		write_sysreg_s(val, SYS_MAIR_EL12);	break;
+	case VBAR_EL1:		write_sysreg_s(val, SYS_VBAR_EL12);	break;
+	case CONTEXTIDR_EL1:	write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+	case TPIDR_EL0:		write_sysreg_s(val, SYS_TPIDR_EL0);	break;
+	case TPIDRRO_EL0:	write_sysreg_s(val, SYS_TPIDRRO_EL0);	break;
+	case TPIDR_EL1:		write_sysreg_s(val, SYS_TPIDR_EL1);	break;
+	case AMAIR_EL1:		write_sysreg_s(val, SYS_AMAIR_EL12);	break;
+	case CNTKCTL_EL1:	write_sysreg_s(val, SYS_CNTKCTL_EL12);	break;
+	case PAR_EL1:		write_sysreg_s(val, SYS_PAR_EL1);	break;
+	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	break;
+	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	break;
+	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	break;
+	default:		return false;
 	}
 
-immediate_write:
+	return true;
+}
+
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+{
+	u64 val = 0x8badf00d8badf00d;
+
+	if (vcpu->arch.sysregs_loaded_on_cpu &&
+	    __vcpu_read_sys_reg_from_cpu(reg, &val))
+		return val;
+
+	return __vcpu_sys_reg(vcpu, reg);
+}
+
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+{
+	if (vcpu->arch.sysregs_loaded_on_cpu &&
+	    __vcpu_write_sys_reg_to_cpu(val, reg))
+		return;
+
 	 __vcpu_sys_reg(vcpu, reg) = val;
 }
 
-- 
cgit v1.2.3


From 7ccadf23b8613c946f67e2b3c5e7f436858021aa Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Jan 2020 11:54:42 +0000
Subject: KVM: arm64: Add missing reset handlers for PMU emulation

As we're about to become a bit more harsh when it comes to the lack of
reset callbacks, let's add the missing PMU reset handlers. Note that
these only cover *CLR registers that were always covered by their *SET
counterpart, so there is no semantic change here.

Reviewed-by: James Morse <james.morse@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/sys_regs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 50e328ca1419..9d28eabbdf97 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1546,7 +1546,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
 
 	{ SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 },
-	{ SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 },
+	{ SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 },
 
 	{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
 	{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
@@ -1585,8 +1585,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
 	{ SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, PMCR_EL0 },
 	{ SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
-	{ SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 },
-	{ SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 },
+	{ SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
+	{ SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 },
 	{ SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 },
 	{ SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 },
 	{ SYS_DESC(SYS_PMCEID0_EL0), access_pmceid },
-- 
cgit v1.2.3


From bb44a8dbea259bc1dc2177b4bc90ca4e8fcbf659 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 27 Jan 2020 11:21:17 +0000
Subject: KVM: arm64: Move sysreg reset check to boot time

Our sysreg reset check has become a bit silly, as it only checks whether
a reset callback actually exists for a given sysreg entry, and apply the
method if available. Doing the check at each vcpu reset is pretty dumb,
as the tables never change. It is thus perfectly possible to do the same
checks at boot time.

This also allows us to introduce a sparse sys_regs[] array, something
that will be required with ARMv8.4-NV.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/sys_regs.c | 72 +++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 37 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 9d28eabbdf97..ad1d57501d6d 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2087,12 +2087,37 @@ static const struct sys_reg_desc cp15_64_regs[] = {
 	{ SYS_DESC(SYS_AARCH32_CNTP_CVAL),    access_arch_timer },
 };
 
+static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
+			      bool is_32)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		if (!is_32 && table[i].reg && !table[i].reset) {
+			kvm_err("sys_reg table %p entry %d has lacks reset\n",
+				table, i);
+			return 1;
+		}
+
+		if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
+			kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 /* Target specific emulation tables */
 static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS];
 
 void kvm_register_target_sys_reg_table(unsigned int target,
 				       struct kvm_sys_reg_target_table *table)
 {
+	if (check_sysreg_table(table->table64.table, table->table64.num, false) ||
+	    check_sysreg_table(table->table32.table, table->table32.num, true))
+		return;
+
 	target_tables[target] = table;
 }
 
@@ -2378,19 +2403,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
 }
 
 static void reset_sys_reg_descs(struct kvm_vcpu *vcpu,
-				const struct sys_reg_desc *table, size_t num,
-				unsigned long *bmap)
+				const struct sys_reg_desc *table, size_t num)
 {
 	unsigned long i;
 
 	for (i = 0; i < num; i++)
-		if (table[i].reset) {
-			int reg = table[i].reg;
-
+		if (table[i].reset)
 			table[i].reset(vcpu, &table[i]);
-			if (reg > 0 && reg < NR_SYS_REGS)
-				set_bit(reg, bmap);
-		}
 }
 
 /**
@@ -2846,32 +2865,18 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 	return write_demux_regids(uindices);
 }
 
-static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 1; i < n; i++) {
-		if (cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
-			kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
 void kvm_sys_reg_table_init(void)
 {
 	unsigned int i;
 	struct sys_reg_desc clidr;
 
 	/* Make sure tables are unique and in order. */
-	BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs)));
-	BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs)));
-	BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs)));
-	BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs)));
-	BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs)));
-	BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)));
+	BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false));
+	BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true));
+	BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true));
+	BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true));
+	BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true));
+	BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false));
 
 	/* We abuse the reset function to overwrite the table itself. */
 	for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
@@ -2907,17 +2912,10 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
 {
 	size_t num;
 	const struct sys_reg_desc *table;
-	DECLARE_BITMAP(bmap, NR_SYS_REGS) = { 0, };
 
 	/* Generic chip reset first (so target could override). */
-	reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs), bmap);
+	reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
 
 	table = get_target_table(vcpu->arch.target, true, &num);
-	reset_sys_reg_descs(vcpu, table, num, bmap);
-
-	for (num = 1; num < NR_SYS_REGS; num++) {
-		if (WARN(!test_bit(num, bmap),
-			 "Didn't reset __vcpu_sys_reg(%zi)\n", num))
-			break;
-	}
+	reset_sys_reg_descs(vcpu, table, num);
 }
-- 
cgit v1.2.3


From 349c330ced9764667678f4d2804fd4ebc16110c9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sun, 12 Apr 2020 18:49:31 +0100
Subject: KVM: arm64: Don't use empty structures as CPU reset state

Keeping empty structure as the vcpu state initializer is slightly
wasteful: we only want to set pstate, and zero everything else.
Just do that.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/reset.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 658f3a79617b..865c8aa670bc 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -36,15 +36,11 @@ static u32 kvm_ipa_limit;
 /*
  * ARMv8 Reset Values
  */
-static const struct kvm_regs default_regs_reset = {
-	.regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT |
-			PSR_F_BIT | PSR_D_BIT),
-};
+#define VCPU_RESET_PSTATE_EL1	(PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
+				 PSR_F_BIT | PSR_D_BIT)
 
-static const struct kvm_regs default_regs_reset32 = {
-	.regs.pstate = (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT |
-			PSR_AA32_I_BIT | PSR_AA32_F_BIT),
-};
+#define VCPU_RESET_PSTATE_SVC	(PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
+				 PSR_AA32_I_BIT | PSR_AA32_F_BIT)
 
 static bool cpu_has_32bit_el1(void)
 {
@@ -257,9 +253,9 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
  */
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
-	const struct kvm_regs *cpu_reset;
 	int ret = -EINVAL;
 	bool loaded;
+	u32 pstate;
 
 	/* Reset PMU outside of the non-preemptible section */
 	kvm_pmu_vcpu_reset(vcpu);
@@ -290,16 +286,17 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 		if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
 			if (!cpu_has_32bit_el1())
 				goto out;
-			cpu_reset = &default_regs_reset32;
+			pstate = VCPU_RESET_PSTATE_SVC;
 		} else {
-			cpu_reset = &default_regs_reset;
+			pstate = VCPU_RESET_PSTATE_EL1;
 		}
 
 		break;
 	}
 
 	/* Reset core registers */
-	memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset));
+	memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+	vcpu_gp_regs(vcpu)->regs.pstate = pstate;
 
 	/* Reset system registers */
 	kvm_reset_sys_regs(vcpu);
-- 
cgit v1.2.3


From d9d7d84d9906e1bc886c5e0fc66aaad26008264b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 21 Apr 2020 18:32:02 +0100
Subject: KVM: arm64: Parametrize exception entry with a target EL

We currently assume that an exception is delivered to EL1, always.
Once we emulate EL2, this no longer will be the case. To prepare
for this, add a target_mode parameter.

While we're at it, merge the computing of the target PC and PSTATE in
a single function that updates both PC and CPSR after saving their
previous values in the corresponding ELR/SPSR. This ensures that they
are updated in the correct order (a pretty common source of bugs...).

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/ptrace.h |  1 +
 arch/arm64/kvm/inject_fault.c   | 75 +++++++++++++++++++++--------------------
 2 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index bf57308fcd63..953b6a1ce549 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -35,6 +35,7 @@
 #define GIC_PRIO_PSR_I_SET		(1 << 4)
 
 /* Additional SPSR bits not exposed in the UABI */
+#define PSR_MODE_THREAD_BIT	(1 << 0)
 #define PSR_IL_BIT		(1 << 20)
 
 /* AArch32-specific ptrace requests */
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 6aafc2825c1c..e21fdd93027a 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -26,28 +26,12 @@ enum exception_type {
 	except_type_serror	= 0x180,
 };
 
-static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
-{
-	u64 exc_offset;
-
-	switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
-	case PSR_MODE_EL1t:
-		exc_offset = CURRENT_EL_SP_EL0_VECTOR;
-		break;
-	case PSR_MODE_EL1h:
-		exc_offset = CURRENT_EL_SP_ELx_VECTOR;
-		break;
-	case PSR_MODE_EL0t:
-		exc_offset = LOWER_EL_AArch64_VECTOR;
-		break;
-	default:
-		exc_offset = LOWER_EL_AArch32_VECTOR;
-	}
-
-	return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
-}
-
 /*
+ * This performs the exception entry at a given EL (@target_mode), stashing PC
+ * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE.
+ * The EL passed to this function *must* be a non-secure, privileged mode with
+ * bit 0 being set (PSTATE.SP == 1).
+ *
  * When an exception is taken, most PSTATE fields are left unchanged in the
  * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
  * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
@@ -59,10 +43,35 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
  * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
  * MSB to LSB.
  */
-static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
+static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
+			      enum exception_type type)
 {
-	unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
-	unsigned long old, new;
+	unsigned long sctlr, vbar, old, new, mode;
+	u64 exc_offset;
+
+	mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+	if      (mode == target_mode)
+		exc_offset = CURRENT_EL_SP_ELx_VECTOR;
+	else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
+		exc_offset = CURRENT_EL_SP_EL0_VECTOR;
+	else if (!(mode & PSR_MODE32_BIT))
+		exc_offset = LOWER_EL_AArch64_VECTOR;
+	else
+		exc_offset = LOWER_EL_AArch32_VECTOR;
+
+	switch (target_mode) {
+	case PSR_MODE_EL1h:
+		vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1);
+		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+		vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
+		break;
+	default:
+		/* Don't do that */
+		BUG();
+	}
+
+	*vcpu_pc(vcpu) = vbar + exc_offset + type;
 
 	old = *vcpu_cpsr(vcpu);
 	new = 0;
@@ -105,9 +114,10 @@ static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
 	new |= PSR_I_BIT;
 	new |= PSR_F_BIT;
 
-	new |= PSR_MODE_EL1h;
+	new |= target_mode;
 
-	return new;
+	*vcpu_cpsr(vcpu) = new;
+	vcpu_write_spsr(vcpu, old);
 }
 
 static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -116,11 +126,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
 	bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
 	u32 esr = 0;
 
-	vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
-	*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
-
-	*vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
-	vcpu_write_spsr(vcpu, cpsr);
+	enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
 
 	vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
 
@@ -148,14 +154,9 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
 
 static void inject_undef64(struct kvm_vcpu *vcpu)
 {
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
 	u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
 
-	vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
-	*vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
-
-	*vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
-	vcpu_write_spsr(vcpu, cpsr);
+	enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
 
 	/*
 	 * Build an unknown exception, depending on the instruction
-- 
cgit v1.2.3


From 8f7f4fe756bd5cfef73cf8234445081385bdbf7d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 27 May 2020 11:38:57 +0100
Subject: KVM: arm64: Drop obsolete comment about sys_reg ordering

The general comment about keeping the enum order in sync
with the save/restore code has been obsolete for many years now.

Just drop it.

Note that there are other ordering requirements in the enum,
such as the PtrAuth and PMU registers, which are still valid.

Reported-by: James Morse <james.morse@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 69a338a390a6..59029e90b557 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -115,12 +115,8 @@ struct kvm_vcpu_fault_info {
 	u64 disr_el1;		/* Deferred [SError] Status Register */
 };
 
-/*
- * 0 is reserved as an invalid value.
- * Order should be kept in sync with the save/restore code.
- */
 enum vcpu_sysreg {
-	__INVALID_SYSREG__,
+	__INVALID_SYSREG__,   /* 0 is reserved as an invalid value */
 	MPIDR_EL1,	/* MultiProcessor Affinity Register */
 	CSSELR_EL1,	/* Cache Size Selection Register */
 	SCTLR_EL1,	/* System Control Register */
-- 
cgit v1.2.3


From c9d40913ac5a21eb2b976bb221a4677540e84eba Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 11:21:49 -0400
Subject: KVM: x86: enable event window in inject_pending_event

In case an interrupt arrives after nested.check_events but before the
call to kvm_cpu_has_injectable_intr, we could end up enabling the interrupt
window even if the interrupt is actually going to be a vmexit.  This is
useless rather than harmful, but it really complicates reasoning about
SVM's handling of the VINTR intercept.  We'd like to never bother with
the VINTR intercept if V_INTR_MASKING=1 && INTERCEPT_INTR=1, because in
that case there is no interrupt window and we can just exit the nested
guest whenever we want.

This patch moves the opening of the interrupt window inside
inject_pending_event.  This consolidates the check for pending
interrupt/NMI/SMI in one place, and makes KVM's usage of immediate
exits more consistent, extending it beyond just nested virtualization.

There are two functional changes here.  They only affect corner cases,
but overall they simplify the inject_pending_event.

- re-injection of still-pending events will also use req_immediate_exit
instead of using interrupt-window intercepts.  This should have no impact
on performance on Intel since it simply replaces an interrupt-window
or NMI-window exit for a preemption-timer exit.  On AMD, which has no
equivalent of the preemption time, it may incur some overhead but an
actual effect on performance should only be visible in pathological cases.

- kvm_arch_interrupt_allowed and kvm_vcpu_has_events will return true
if an interrupt, NMI or SMI is blocked by nested_run_pending.  This
makes sense because entering the VM will allow it to make progress
and deliver the event.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   8 +--
 arch/x86/kvm/svm/svm.c          |  24 ++++-----
 arch/x86/kvm/vmx/vmx.c          |  20 +++----
 arch/x86/kvm/x86.c              | 117 ++++++++++++++++++++++------------------
 4 files changed, 92 insertions(+), 77 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index db261da578f3..7707bd4b0593 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1136,8 +1136,8 @@ struct kvm_x86_ops {
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu);
 	void (*cancel_injection)(struct kvm_vcpu *vcpu);
-	bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
-	bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+	int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+	int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
 	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
@@ -1234,10 +1234,10 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
-	bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
+	int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
-	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+	void (*enable_smi_window)(struct kvm_vcpu *vcpu);
 
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d877a0f70cac..5da494847a2f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3053,15 +3053,15 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 
 	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
 	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
-		return false;
+		return -EBUSY;
 
 	return !svm_nmi_blocked(vcpu);
 }
@@ -3112,18 +3112,18 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
 }
 
-static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 
 	/*
 	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
 	 * e.g. if the IRQ arrived asynchronously after checking nested events.
 	 */
 	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
-		return false;
+		return -EBUSY;
 
 	return !svm_interrupt_blocked(vcpu);
 }
@@ -3793,15 +3793,15 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu)
 	return is_smm(vcpu);
 }
 
-static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	if (svm->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 
 	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
 	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
-		return false;
+		return -EBUSY;
 
 	return !svm_smi_blocked(vcpu);
 }
@@ -3848,7 +3848,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 	return 0;
 }
 
-static int enable_smi_window(struct kvm_vcpu *vcpu)
+static void enable_smi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3856,9 +3856,9 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 		if (vgif_enabled(svm))
 			set_intercept(svm, INTERCEPT_STGI);
 		/* STGI will cause a vm exit */
-		return 1;
+	} else {
+		/* We must be in SMM; RSM will cause a vmexit anyway.  */
 	}
-	return 0;
 }
 
 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4e76e30b661c..b3a41645e157 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4551,14 +4551,14 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
 		 GUEST_INTR_STATE_NMI));
 }
 
-static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 
 	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
 	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
-		return false;
+		return -EBUSY;
 
 	return !vmx_nmi_blocked(vcpu);
 }
@@ -4573,17 +4573,17 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
 		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
 
-static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 
        /*
         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
         * e.g. if the IRQ arrived asynchronously after checking nested events.
         */
 	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		return false;
+		return -EBUSY;
 
 	return !vmx_interrupt_blocked(vcpu);
 }
@@ -7757,11 +7757,11 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 			~FEAT_CTL_LMCE_ENABLED;
 }
 
-static bool vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	/* we need a nested vmexit to enter SMM, postpone if run is pending */
 	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return false;
+		return -EBUSY;
 	return !is_smm(vcpu);
 }
 
@@ -7799,9 +7799,9 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 	return 0;
 }
 
-static int enable_smi_window(struct kvm_vcpu *vcpu)
+static void enable_smi_window(struct kvm_vcpu *vcpu)
 {
-	return 0;
+	/* RSM will cause a vmexit anyway.  */
 }
 
 static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77b9b4e66673..0ee828f60d05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7714,7 +7714,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static int inject_pending_event(struct kvm_vcpu *vcpu)
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 {
 	int r;
 	bool can_inject = true;
@@ -7760,8 +7760,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 	 */
 	if (is_guest_mode(vcpu)) {
 		r = kvm_x86_ops.nested_ops->check_events(vcpu);
-		if (r != 0)
-			return r;
+		if (r < 0)
+			goto busy;
 	}
 
 	/* try to inject new event if pending */
@@ -7799,27 +7799,69 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
 		can_inject = false;
 	}
 
-	/* Finish re-injection before considering new events */
-	if (!can_inject)
-		return 0;
+	/*
+	 * Finally, inject interrupt events.  If an event cannot be injected
+	 * due to architectural conditions (e.g. IF=0) a window-open exit
+	 * will re-request KVM_REQ_EVENT.  Sometimes however an event is pending
+	 * and can architecturally be injected, but we cannot do it right now:
+	 * an interrupt could have arrived just now and we have to inject it
+	 * as a vmexit, or there could already an event in the queue, which is
+	 * indicated by can_inject.  In that case we request an immediate exit
+	 * in order to make progress and get back here for another iteration.
+	 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
+	 */
+	if (vcpu->arch.smi_pending) {
+		r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+		if (r < 0)
+			goto busy;
+		if (r) {
+			vcpu->arch.smi_pending = false;
+			++vcpu->arch.smi_count;
+			enter_smm(vcpu);
+			can_inject = false;
+		} else
+			kvm_x86_ops.enable_smi_window(vcpu);
+	}
+
+	if (vcpu->arch.nmi_pending) {
+		r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+		if (r < 0)
+			goto busy;
+		if (r) {
+			--vcpu->arch.nmi_pending;
+			vcpu->arch.nmi_injected = true;
+			kvm_x86_ops.set_nmi(vcpu);
+			can_inject = false;
+			WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+		}
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops.enable_nmi_window(vcpu);
+	}
 
-	if (vcpu->arch.smi_pending &&
-	    kvm_x86_ops.smi_allowed(vcpu, true)) {
-		vcpu->arch.smi_pending = false;
-		++vcpu->arch.smi_count;
-		enter_smm(vcpu);
-	} else if (vcpu->arch.nmi_pending &&
-		   kvm_x86_ops.nmi_allowed(vcpu, true)) {
-		--vcpu->arch.nmi_pending;
-		vcpu->arch.nmi_injected = true;
-		kvm_x86_ops.set_nmi(vcpu);
-	} else if (kvm_cpu_has_injectable_intr(vcpu) &&
-		   kvm_x86_ops.interrupt_allowed(vcpu, true)) {
-		kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
-		kvm_x86_ops.set_irq(vcpu);
+	if (kvm_cpu_has_injectable_intr(vcpu)) {
+		r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+		if (r < 0)
+			goto busy;
+		if (r) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+			kvm_x86_ops.set_irq(vcpu);
+			WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+		}
+		if (kvm_cpu_has_injectable_intr(vcpu))
+			kvm_x86_ops.enable_irq_window(vcpu);
 	}
 
-	return 0;
+	if (is_guest_mode(vcpu) &&
+	    kvm_x86_ops.nested_ops->hv_timer_pending &&
+	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+		*req_immediate_exit = true;
+
+	WARN_ON(vcpu->arch.exception.pending);
+	return;
+
+busy:
+	*req_immediate_exit = true;
+	return;
 }
 
 static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8357,36 +8399,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			goto out;
 		}
 
-		if (inject_pending_event(vcpu) != 0)
-			req_immediate_exit = true;
-		else {
-			/* Enable SMI/NMI/IRQ window open exits if needed.
-			 *
-			 * SMIs have three cases:
-			 * 1) They can be nested, and then there is nothing to
-			 *    do here because RSM will cause a vmexit anyway.
-			 * 2) There is an ISA-specific reason why SMI cannot be
-			 *    injected, and the moment when this changes can be
-			 *    intercepted.
-			 * 3) Or the SMI can be pending because
-			 *    inject_pending_event has completed the injection
-			 *    of an IRQ or NMI from the previous vmexit, and
-			 *    then we request an immediate exit to inject the
-			 *    SMI.
-			 */
-			if (vcpu->arch.smi_pending && !is_smm(vcpu))
-				if (!kvm_x86_ops.enable_smi_window(vcpu))
-					req_immediate_exit = true;
-			if (vcpu->arch.nmi_pending)
-				kvm_x86_ops.enable_nmi_window(vcpu);
-			if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-				kvm_x86_ops.enable_irq_window(vcpu);
-			if (is_guest_mode(vcpu) &&
-			    kvm_x86_ops.nested_ops->hv_timer_pending &&
-			    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
-				req_immediate_exit = true;
-			WARN_ON(vcpu->arch.exception.pending);
-		}
+		inject_pending_event(vcpu, &req_immediate_exit);
+		if (req_int_win)
+			kvm_x86_ops.enable_irq_window(vcpu);
 
 		if (kvm_lapic_enabled(vcpu)) {
 			update_cr8_intercept(vcpu);
-- 
cgit v1.2.3


From 7c86663b68bab393633d8312a0d25a3d004de182 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sat, 16 May 2020 08:42:28 -0400
Subject: KVM: nSVM: inject exceptions via svm_check_nested_events

This allows exceptions injected by the emulator to be properly delivered
as vmexits.  The code also becomes simpler, because we can just let all
L0-intercepted exceptions go through the usual path.  In particular, our
emulation of the VMX #DB exit qualification is very much simplified,
because the vmexit injection path can use kvm_deliver_exception_payload
to update DR6.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/svm/nested.c       | 136 ++++++++++++++++------------------------
 arch/x86/kvm/svm/svm.c          |   9 ---
 arch/x86/kvm/svm/svm.h          |   1 +
 arch/x86/kvm/x86.c              |  13 +---
 5 files changed, 58 insertions(+), 103 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7707bd4b0593..e6f2e1a2dab6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1495,6 +1495,8 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
+void kvm_update_dr7(struct kvm_vcpu *vcpu);
+
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index f39e0d578b9b..1ae13fd17028 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -111,6 +111,8 @@ void recalc_intercepts(struct vcpu_svm *svm)
 	h = &svm->nested.hsave->control;
 	g = &svm->nested;
 
+	svm->nested.host_intercept_exceptions = h->intercept_exceptions;
+
 	c->intercept_cr = h->intercept_cr;
 	c->intercept_dr = h->intercept_dr;
 	c->intercept_exceptions = h->intercept_exceptions;
@@ -616,50 +618,6 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
-/* DB exceptions for our internal use must not cause vmexit */
-static int nested_svm_intercept_db(struct vcpu_svm *svm)
-{
-	unsigned long dr6 = svm->vmcb->save.dr6;
-
-	/* Always catch it and pass it to userspace if debugging.  */
-	if (svm->vcpu.guest_debug &
-	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-		return NESTED_EXIT_HOST;
-
-	/* if we're not singlestepping, it's not ours */
-	if (!svm->nmi_singlestep)
-		goto reflected_db;
-
-	/* if it's not a singlestep exception, it's not ours */
-	if (!(dr6 & DR6_BS))
-		goto reflected_db;
-
-	/* if the guest is singlestepping, it should get the vmexit */
-	if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
-		disable_nmi_singlestep(svm);
-		goto reflected_db;
-	}
-
-	/* it's ours, the nested hypervisor must not see this one */
-	return NESTED_EXIT_HOST;
-
-reflected_db:
-	/*
-	 * Synchronize guest DR6 here just like in kvm_deliver_exception_payload;
-	 * it will be moved into the nested VMCB by nested_svm_vmexit.  Once
-	 * exceptions will be moved to svm_check_nested_events, all this stuff
-	 * will just go away and we could just return NESTED_EXIT_HOST
-	 * unconditionally.  db_interception will queue the exception, which
-	 * will be processed by svm_check_nested_events if a nested vmexit is
-	 * required, and we will just use kvm_deliver_exception_payload to copy
-	 * the payload to DR6 before vmexit.
-	 */
-	WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT);
-	svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM);
-	svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1;
-	return NESTED_EXIT_DONE;
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
 	unsigned port, size, iopm_len;
@@ -710,20 +668,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 		break;
 	}
 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
-		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-		if (svm->nested.intercept_exceptions & excp_bits) {
-			if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
-				vmexit = nested_svm_intercept_db(svm);
-			else if (exit_code == SVM_EXIT_EXCP_BASE + BP_VECTOR &&
-				 svm->vcpu.guest_debug & KVM_GUESTDBG_USE_SW_BP)
-				vmexit = NESTED_EXIT_HOST;
-			else
-				vmexit = NESTED_EXIT_DONE;
-		}
-		/* async page fault always cause vmexit */
-		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
-			 svm->vcpu.arch.exception.nested_apf != 0)
-			vmexit = NESTED_EXIT_DONE;
+		/*
+		 * Host-intercepted exceptions have been checked already in
+		 * nested_svm_exit_special.  There is nothing to do here,
+		 * the vmexit is injected by svm_check_nested_events.
+		 */
+		vmexit = NESTED_EXIT_DONE;
 		break;
 	}
 	case SVM_EXIT_ERR: {
@@ -768,35 +718,45 @@ int nested_svm_check_permissions(struct vcpu_svm *svm)
 	return 0;
 }
 
-int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
-			       bool has_error_code, u32 error_code)
+static bool nested_exit_on_exception(struct vcpu_svm *svm)
 {
-	int vmexit;
+	unsigned int nr = svm->vcpu.arch.exception.nr;
 
-	if (!is_guest_mode(&svm->vcpu))
-		return 0;
+	return (svm->nested.intercept_exceptions & (1 << nr));
+}
 
-	vmexit = nested_svm_intercept(svm);
-	if (vmexit != NESTED_EXIT_DONE)
-		return 0;
+static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
+{
+	unsigned int nr = svm->vcpu.arch.exception.nr;
 
 	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
 	svm->vmcb->control.exit_code_hi = 0;
-	svm->vmcb->control.exit_info_1 = error_code;
+
+	if (svm->vcpu.arch.exception.has_error_code)
+		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
 
 	/*
 	 * EXITINFO2 is undefined for all exception intercepts other
 	 * than #PF.
 	 */
-	if (svm->vcpu.arch.exception.nested_apf)
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
-	else if (svm->vcpu.arch.exception.has_payload)
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
-	else
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+	if (nr == PF_VECTOR) {
+		if (svm->vcpu.arch.exception.nested_apf)
+			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+		else if (svm->vcpu.arch.exception.has_payload)
+			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+		else
+			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+	} else if (nr == DB_VECTOR) {
+		/* See inject_pending_event.  */
+		kvm_deliver_exception_payload(&svm->vcpu);
+		if (svm->vcpu.arch.dr7 & DR7_GD) {
+			svm->vcpu.arch.dr7 &= ~DR7_GD;
+			kvm_update_dr7(&svm->vcpu);
+		}
+	} else
+		WARN_ON(svm->vcpu.arch.exception.has_payload);
 
-	svm->nested.exit_required = true;
-	return vmexit;
+	nested_svm_vmexit(svm);
 }
 
 static void nested_svm_smi(struct vcpu_svm *svm)
@@ -835,6 +795,15 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
 		svm->nested.nested_run_pending;
 
+	if (vcpu->arch.exception.pending) {
+		if (block_nested_events)
+                        return -EBUSY;
+		if (!nested_exit_on_exception(svm))
+			return 0;
+		nested_svm_inject_exception_vmexit(svm);
+		return 0;
+	}
+
 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -872,18 +841,19 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 	switch (exit_code) {
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
-	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
-		return NESTED_EXIT_HOST;
 	case SVM_EXIT_NPF:
-		/* For now we are always handling NPFs when using them */
-		if (npt_enabled)
+		return NESTED_EXIT_HOST;
+	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+
+		if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
 			return NESTED_EXIT_HOST;
-		break;
-	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-		/* Trap async PF even if not shadowing */
-		if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
+		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+			 svm->vcpu.arch.apf.host_apf_reason)
+			/* Trap async PF even if not shadowing */
 			return NESTED_EXIT_HOST;
 		break;
+	}
 	default:
 		break;
 	}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5da494847a2f..89b6fb1e0abc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -331,17 +331,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned nr = vcpu->arch.exception.nr;
 	bool has_error_code = vcpu->arch.exception.has_error_code;
-	bool reinject = vcpu->arch.exception.injected;
 	u32 error_code = vcpu->arch.exception.error_code;
 
-	/*
-	 * If we are within a nested VM we'd better #VMEXIT and let the guest
-	 * handle the exception
-	 */
-	if (!reinject &&
-	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
-		return;
-
 	kvm_deliver_exception_payload(&svm->vcpu);
 
 	if (nr == BP_VECTOR && !nrips) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5cc559ab862d..8342032291fc 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -86,6 +86,7 @@ struct nested_state {
 	u64 hsave_msr;
 	u64 vm_cr_msr;
 	u64 vmcb;
+	u32 host_intercept_exceptions;
 
 	/* These are the merged vectors */
 	u32 *msrpm;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ee828f60d05..f0fa610bed91 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1072,7 +1072,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
 {
 	unsigned long dr7;
 
@@ -1085,6 +1085,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 	if (dr7 & DR7_BP_EN_MASK)
 		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
 
 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 {
@@ -7778,16 +7779,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 					     X86_EFLAGS_RF);
 
 		if (vcpu->arch.exception.nr == DB_VECTOR) {
-			/*
-			 * This code assumes that nSVM doesn't use
-			 * check_nested_events(). If it does, the
-			 * DR6/DR7 changes should happen before L1
-			 * gets a #VMEXIT for an intercepted #DB in
-			 * L2.  (Under VMX, on the other hand, the
-			 * DR6/DR7 changes should not happen in the
-			 * event of a VM-exit to L1 for an intercepted
-			 * #DB in L2.)
-			 */
 			kvm_deliver_exception_payload(vcpu);
 			if (vcpu->arch.dr7 & DR7_GD) {
 				vcpu->arch.dr7 &= ~DR7_GD;
-- 
cgit v1.2.3


From bd279629f73f27f0931d09cd3da904b923fb6f35 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sat, 16 May 2020 08:46:00 -0400
Subject: KVM: nSVM: remove exit_required

All events now inject vmexits before vmentry rather than after vmexit.  Therefore,
exit_required is not set anymore and we can remove it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c |  3 +--
 arch/x86/kvm/svm/svm.c    | 14 --------------
 arch/x86/kvm/svm/svm.h    |  3 ---
 3 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1ae13fd17028..3d17c62a84a3 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -792,8 +792,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	bool block_nested_events =
-		kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required ||
-		svm->nested.nested_run_pending;
+		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
 
 	if (vcpu->arch.exception.pending) {
 		if (block_nested_events)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 89b6fb1e0abc..545f63ebc720 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2889,13 +2889,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	if (npt_enabled)
 		vcpu->arch.cr3 = svm->vmcb->save.cr3;
 
-	if (unlikely(svm->nested.exit_required)) {
-		nested_svm_vmexit(svm);
-		svm->nested.exit_required = false;
-
-		return 1;
-	}
-
 	if (is_guest_mode(vcpu)) {
 		int vmexit;
 
@@ -3327,13 +3320,6 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
 	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
 
-	/*
-	 * A vmexit emulation is required before the vcpu can be executed
-	 * again.
-	 */
-	if (unlikely(svm->nested.exit_required))
-		return EXIT_FASTPATH_NONE;
-
 	/*
 	 * Disable singlestep if we're injecting an interrupt/exception.
 	 * We don't want our modified rflags to be pushed on the stack where
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8342032291fc..89fab75dd4f5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -95,9 +95,6 @@ struct nested_state {
 	u64 vmcb_msrpm;
 	u64 vmcb_iopm;
 
-	/* A VMEXIT is required but not yet emulated */
-	bool exit_required;
-
 	/* A VMRUN has started but has not yet been performed, so
 	 * we cannot inject a nested vmexit yet.  */
 	bool nested_run_pending;
-- 
cgit v1.2.3


From 5b672408660e309ce7d54462eea050d5e5d0fc3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sat, 16 May 2020 08:50:35 -0400
Subject: KVM: nSVM: correctly inject INIT vmexits

The usual drill at this point, except there is no code to remove because this
case was not handled at all.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3d17c62a84a3..dcac4c3510ab 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "mmu.h"
 #include "x86.h"
+#include "lapic.h"
 #include "svm.h"
 
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -788,11 +789,37 @@ static void nested_svm_intr(struct vcpu_svm *svm)
 	nested_svm_vmexit(svm);
 }
 
+static inline bool nested_exit_on_init(struct vcpu_svm *svm)
+{
+	return (svm->nested.intercept & (1ULL << INTERCEPT_INIT));
+}
+
+static void nested_svm_init(struct vcpu_svm *svm)
+{
+	svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
+
+	nested_svm_vmexit(svm);
+}
+
+
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	bool block_nested_events =
 		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (lapic_in_kernel(vcpu) &&
+	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+		if (block_nested_events)
+			return -EBUSY;
+		if (!nested_exit_on_init(svm))
+			return 0;
+		nested_svm_init(svm);
+		return 0;
+	}
 
 	if (vcpu->arch.exception.pending) {
 		if (block_nested_events)
-- 
cgit v1.2.3


From 978ce5837c7ed50e4ea30cc0fa20f2f820edf8ea Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 May 2020 08:37:37 -0400
Subject: KVM: SVM: always update CR3 in VMCB

svm_load_mmu_pgd is delaying the write of GUEST_CR3 to prepare_vmcs02 as
an optimization, but this is only correct before the nested vmentry.
If userspace is modifying CR3 with KVM_SET_SREGS after the VM has
already been put in guest mode, the value of CR3 will not be updated.
Remove the optimization, which almost never triggers anyway.
This was was added in commit 689f3bf21628 ("KVM: x86: unify callbacks
to load paging root", 2020-03-16) just to keep the two vendor-specific
modules closer, but we'll fix VMX too.

Fixes: 689f3bf21628 ("KVM: x86: unify callbacks to load paging root")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c |  6 +-----
 arch/x86/kvm/svm/svm.c    | 16 +++++-----------
 2 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index dcac4c3510ab..8756c9f463fd 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -256,11 +256,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
 	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
 	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
-	if (npt_enabled) {
-		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
-		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
-	} else
-		(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+	(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
 
 	/* Guest paging mode is active - reset mmu */
 	kvm_mmu_reset_context(&svm->vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 545f63ebc720..feb96a410f2d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3447,7 +3447,6 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	bool update_guest_cr3 = true;
 	unsigned long cr3;
 
 	cr3 = __sme_set(root);
@@ -3456,18 +3455,13 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
 		mark_dirty(svm->vmcb, VMCB_NPT);
 
 		/* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
-		if (is_guest_mode(vcpu))
-			update_guest_cr3 = false;
-		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
-			cr3 = vcpu->arch.cr3;
-		else /* CR3 is already up-to-date.  */
-			update_guest_cr3 = false;
+		if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+			return;
+		cr3 = vcpu->arch.cr3;
 	}
 
-	if (update_guest_cr3) {
-		svm->vmcb->save.cr3 = cr3;
-		mark_dirty(svm->vmcb, VMCB_CR);
-	}
+	svm->vmcb->save.cr3 = cr3;
+	mark_dirty(svm->vmcb, VMCB_CR);
 }
 
 static int is_disabled(void)
-- 
cgit v1.2.3


From df7e0681dd8acfa8d07816eaef232ded816d8a8c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 May 2020 08:37:37 -0400
Subject: KVM: nVMX: always update CR3 in VMCS

vmx_load_mmu_pgd is delaying the write of GUEST_CR3 to prepare_vmcs02 as
an optimization, but this is only correct before the nested vmentry.
If userspace is modifying CR3 with KVM_SET_SREGS after the VM has
already been put in guest mode, the value of CR3 will not be updated.
Remove the optimization, which almost never triggers anyway.

Fixes: 04f11ef45810 ("KVM: nVMX: Always write vmcs02.GUEST_CR3 during nested VM-Enter")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b3a41645e157..7b55dc6230a9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3085,10 +3085,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
 			spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
 		}
 
-		/* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
-		if (is_guest_mode(vcpu))
-			update_guest_cr3 = false;
-		else if (!enable_unrestricted_guest && !is_paging(vcpu))
+		if (!enable_unrestricted_guest && !is_paging(vcpu))
 			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
 		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 			guest_cr3 = vcpu->arch.cr3;
-- 
cgit v1.2.3


From 69c9dfa24bb7bac5c9e2bd4d3f631e35b91e3733 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 May 2020 12:57:26 -0400
Subject: KVM: nSVM: move map argument out of enter_svm_guest_mode

Unmapping the nested VMCB in enter_svm_guest_mode is a bit of a wart,
since the map argument is not used elsewhere in the function.  There are
just two callers, and those are also the place where kvm_vcpu_map is
called, so it is cleaner to unmap there.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 14 ++++++--------
 arch/x86/kvm/svm/svm.c    |  3 ++-
 arch/x86/kvm/svm/svm.h    |  2 +-
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8756c9f463fd..8e98d5e420a5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -229,7 +229,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-			  struct vmcb *nested_vmcb, struct kvm_host_map *map)
+			  struct vmcb *nested_vmcb)
 {
 	bool evaluate_pending_interrupts =
 		is_intercept(svm, INTERCEPT_VINTR) ||
@@ -304,8 +304,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm->vmcb->control.pause_filter_thresh =
 		nested_vmcb->control.pause_filter_thresh;
 
-	kvm_vcpu_unmap(&svm->vcpu, map, true);
-
 	/* Enter Guest-Mode */
 	enter_guest_mode(&svm->vcpu);
 
@@ -368,10 +366,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 		nested_vmcb->control.exit_code_hi = 0;
 		nested_vmcb->control.exit_info_1  = 0;
 		nested_vmcb->control.exit_info_2  = 0;
-
-		kvm_vcpu_unmap(&svm->vcpu, &map, true);
-
-		return ret;
+		goto out;
 	}
 
 	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
@@ -414,7 +409,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 	copy_vmcb_control_area(hsave, vmcb);
 
 	svm->nested.nested_run_pending = 1;
-	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
+	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb);
 
 	if (!nested_svm_vmrun_msrpm(svm)) {
 		svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
@@ -425,6 +420,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 		nested_svm_vmexit(svm);
 	}
 
+out:
+	kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
 	return ret;
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index feb96a410f2d..76b3f553815e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3814,7 +3814,8 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 		if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
 			return 1;
 		nested_vmcb = map.hva;
-		enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
+		enter_svm_guest_mode(svm, vmcb, nested_vmcb);
+		kvm_vcpu_unmap(&svm->vcpu, &map, true);
 	}
 	return 0;
 }
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 89fab75dd4f5..33e3f09d7a8e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -395,7 +395,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 }
 
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-			  struct vmcb *nested_vmcb, struct kvm_host_map *map);
+			  struct vmcb *nested_vmcb);
 int nested_svm_vmrun(struct vcpu_svm *svm);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
-- 
cgit v1.2.3


From 3e06f0163f7f75364290bf338f4e279f6e3e404f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 May 2020 13:07:26 -0400
Subject: KVM: nSVM: extract load_nested_vmcb_control

When restoring SVM nested state, the control state cache in svm->nested
will have to be filled, but the save state will not have to be moved
into svm->vmcb.  Therefore, pull the code that handles the control area
into a separate function.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 8e98d5e420a5..fc0c6d1678eb 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -228,6 +228,23 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	return true;
 }
 
+static void load_nested_vmcb_control(struct vcpu_svm *svm,
+				     struct vmcb_control_area *control)
+{
+	svm->nested.nested_cr3 = control->nested_cr3;
+
+	svm->nested.vmcb_msrpm = control->msrpm_base_pa & ~0x0fffULL;
+	svm->nested.vmcb_iopm  = control->iopm_base_pa  & ~0x0fffULL;
+
+	/* cache intercepts */
+	svm->nested.intercept_cr         = control->intercept_cr;
+	svm->nested.intercept_dr         = control->intercept_dr;
+	svm->nested.intercept_exceptions = control->intercept_exceptions;
+	svm->nested.intercept            = control->intercept;
+
+	svm->vcpu.arch.tsc_offset += control->tsc_offset;
+}
+
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			  struct vmcb *nested_vmcb)
 {
@@ -235,15 +252,16 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 		is_intercept(svm, INTERCEPT_VINTR) ||
 		is_intercept(svm, INTERCEPT_IRET);
 
+	svm->nested.vmcb = vmcb_gpa;
 	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
 		svm->vcpu.arch.hflags |= HF_HIF_MASK;
 	else
 		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
-	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
-		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+	load_nested_vmcb_control(svm, &nested_vmcb->control);
+
+	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
 		nested_svm_init_mmu_context(&svm->vcpu);
-	}
 
 	/* Load the nested guest state */
 	svm->vmcb->save.es = nested_vmcb->save.es;
@@ -274,25 +292,15 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm->vcpu.arch.dr6  = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
-	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
-
-	/* cache intercepts */
-	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
-	svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
-	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
-	svm->nested.intercept            = nested_vmcb->control.intercept;
-
 	svm_flush_tlb(&svm->vcpu);
-	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-	svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
 
+	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
@@ -313,8 +321,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	 */
 	recalc_intercepts(svm);
 
-	svm->nested.vmcb = vmcb_gpa;
-
 	/*
 	 * If L1 had a pending IRQ/NMI before executing VMRUN,
 	 * which wasn't delivered because it was disallowed (e.g.
-- 
cgit v1.2.3


From f241d711b2d17f79ae5fad78c5644c674fce42ba Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 10:56:43 -0400
Subject: KVM: nSVM: extract preparation of VMCB for nested run

Split out filling svm->vmcb.save and svm->vmcb.control before VMRUN.
Only the latter will be useful when restoring nested SVM state.

This patch introduces no semantic change, so the MMU setup is still
done in nested_prepare_vmcb_save.  The next patch will clean up things.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fc0c6d1678eb..73be7af79453 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -245,21 +245,8 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 	svm->vcpu.arch.tsc_offset += control->tsc_offset;
 }
 
-void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-			  struct vmcb *nested_vmcb)
+static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
 {
-	bool evaluate_pending_interrupts =
-		is_intercept(svm, INTERCEPT_VINTR) ||
-		is_intercept(svm, INTERCEPT_IRET);
-
-	svm->nested.vmcb = vmcb_gpa;
-	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
-		svm->vcpu.arch.hflags |= HF_HIF_MASK;
-	else
-		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
-	load_nested_vmcb_control(svm, &nested_vmcb->control);
-
 	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
 		nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -291,7 +278,10 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
 	svm->vcpu.arch.dr6  = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+}
 
+static void nested_prepare_vmcb_control(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
+{
 	svm_flush_tlb(&svm->vcpu);
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -321,6 +311,26 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	 */
 	recalc_intercepts(svm);
 
+	mark_all_dirty(svm->vmcb);
+}
+
+void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+			  struct vmcb *nested_vmcb)
+{
+	bool evaluate_pending_interrupts =
+		is_intercept(svm, INTERCEPT_VINTR) ||
+		is_intercept(svm, INTERCEPT_IRET);
+
+	svm->nested.vmcb = vmcb_gpa;
+	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
+		svm->vcpu.arch.hflags |= HF_HIF_MASK;
+	else
+		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+	load_nested_vmcb_control(svm, &nested_vmcb->control);
+	nested_prepare_vmcb_save(svm, nested_vmcb);
+	nested_prepare_vmcb_control(svm, nested_vmcb);
+
 	/*
 	 * If L1 had a pending IRQ/NMI before executing VMRUN,
 	 * which wasn't delivered because it was disallowed (e.g.
@@ -336,8 +346,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	enable_gif(svm);
 	if (unlikely(evaluate_pending_interrupts))
 		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
-	mark_all_dirty(svm->vmcb);
 }
 
 int nested_svm_vmrun(struct vcpu_svm *svm)
-- 
cgit v1.2.3


From 69cb877487de3662e08081cf036f8fe0e4a3b806 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 05:27:46 -0400
Subject: KVM: nSVM: move MMU setup to nested_prepare_vmcb_control

Everything that is needed during nested state restore is now part of
nested_prepare_vmcb_control.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 73be7af79453..a85cc7376a82 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -247,9 +247,6 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
 {
-	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
-		nested_svm_init_mmu_context(&svm->vcpu);
-
 	/* Load the nested guest state */
 	svm->vmcb->save.es = nested_vmcb->save.es;
 	svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -263,9 +260,6 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
 	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
 	(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
 
-	/* Guest paging mode is active - reset mmu */
-	kvm_mmu_reset_context(&svm->vcpu);
-
 	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
 	kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
 	kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
@@ -282,6 +276,12 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
 
 static void nested_prepare_vmcb_control(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
 {
+	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
+		nested_svm_init_mmu_context(&svm->vcpu);
+
+	/* Guest paging mode is active - reset mmu */
+	kvm_mmu_reset_context(&svm->vcpu);
+
 	svm_flush_tlb(&svm->vcpu);
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
-- 
cgit v1.2.3


From 18fc6c55d1f449ff57778a4003c838a79d62b5a8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 11:07:08 -0400
Subject: KVM: nSVM: clean up tsc_offset update

Use l1_tsc_offset to compute svm->vcpu.arch.tsc_offset and
svm->vmcb->control.tsc_offset, instead of relying on hsave.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a85cc7376a82..5ca403a69148 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -241,8 +241,6 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 	svm->nested.intercept_dr         = control->intercept_dr;
 	svm->nested.intercept_exceptions = control->intercept_exceptions;
 	svm->nested.intercept            = control->intercept;
-
-	svm->vcpu.arch.tsc_offset += control->tsc_offset;
 }
 
 static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
@@ -288,7 +286,8 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm, struct vmcb *neste
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
+		svm->vcpu.arch.l1_tsc_offset + nested_vmcb->control.tsc_offset;
 
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
@@ -553,7 +552,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Restore the original control entries */
 	copy_vmcb_control_area(vmcb, hsave);
 
-	svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
+	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
+		svm->vcpu.arch.l1_tsc_offset;
+
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
-- 
cgit v1.2.3


From 2f675917efc8a4d469f8be95e6ea55b2310bfd0d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 15:21:22 -0400
Subject: KVM: nSVM: pass vmcb_control_area to copy_vmcb_control_area

This will come in handy when we put a struct vmcb_control_area in
svm->nested.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 5ca403a69148..fd9742c1a860 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -141,11 +141,9 @@ void recalc_intercepts(struct vcpu_svm *svm)
 	c->intercept |= g->intercept;
 }
 
-static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+static void copy_vmcb_control_area(struct vmcb_control_area *dst,
+				   struct vmcb_control_area *from)
 {
-	struct vmcb_control_area *dst  = &dst_vmcb->control;
-	struct vmcb_control_area *from = &from_vmcb->control;
-
 	dst->intercept_cr         = from->intercept_cr;
 	dst->intercept_dr         = from->intercept_dr;
 	dst->intercept_exceptions = from->intercept_exceptions;
@@ -419,7 +417,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
 
-	copy_vmcb_control_area(hsave, vmcb);
+	copy_vmcb_control_area(&hsave->control, &vmcb->control);
 
 	svm->nested.nested_run_pending = 1;
 	enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb);
@@ -550,7 +548,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 		nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
 
 	/* Restore the original control entries */
-	copy_vmcb_control_area(vmcb, hsave);
+	copy_vmcb_control_area(&vmcb->control, &hsave->control);
 
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
 		svm->vcpu.arch.l1_tsc_offset;
-- 
cgit v1.2.3


From 7923ef4f6ec4a25a902bd827446eac860b01fd1c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 15:24:46 -0400
Subject: KVM: nSVM: remove trailing padding for struct vmcb_control_area

Allow placing the VMCB structs on the stack or in other structs without
wasting too much space.  Add BUILD_BUG_ON as a quick safeguard against typos.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/svm.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 6ece8561ba66..8a1f5382a4ea 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -96,7 +96,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u8 reserved_6[8];	/* Offset 0xe8 */
 	u64 avic_logical_id;	/* Offset 0xf0 */
 	u64 avic_physical_id;	/* Offset 0xf8 */
-	u8 reserved_7[768];
 };
 
 
@@ -203,8 +202,16 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
 	u64 last_excp_to;
 };
 
+
+static inline void __unused_size_checks(void)
+{
+	BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298);
+	BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256);
+}
+
 struct __attribute__ ((__packed__)) vmcb {
 	struct vmcb_control_area control;
+	u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
 	struct vmcb_save_area save;
 };
 
-- 
cgit v1.2.3


From e670bf68f4b701506d51f007917ab633894294d0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 May 2020 13:16:12 -0400
Subject: KVM: nSVM: save all control fields in svm->nested

In preparation for nested SVM save/restore, store all data that matters
from the VMCB control area into svm->nested.  It will then become part
of the nested SVM state that is saved by KVM_SET_NESTED_STATE and
restored by KVM_GET_NESTED_STATE, just like the cached vmcs12 for nVMX.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 73 +++++++++++++++++++++--------------------------
 arch/x86/kvm/svm/svm.c    |  4 +--
 arch/x86/kvm/svm/svm.h    | 20 ++++---------
 3 files changed, 39 insertions(+), 58 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index fd9742c1a860..1e5f460b5540 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -60,7 +60,7 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	u64 cr3 = svm->nested.nested_cr3;
+	u64 cr3 = svm->nested.ctl.nested_cr3;
 	u64 pdpte;
 	int ret;
 
@@ -75,7 +75,7 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	return svm->nested.nested_cr3;
+	return svm->nested.ctl.nested_cr3;
 }
 
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
@@ -100,8 +100,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 
 void recalc_intercepts(struct vcpu_svm *svm)
 {
-	struct vmcb_control_area *c, *h;
-	struct nested_state *g;
+	struct vmcb_control_area *c, *h, *g;
 
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 
@@ -110,7 +109,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
 
 	c = &svm->vmcb->control;
 	h = &svm->nested.hsave->control;
-	g = &svm->nested;
+	g = &svm->nested.ctl;
 
 	svm->nested.host_intercept_exceptions = h->intercept_exceptions;
 
@@ -180,7 +179,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	 */
 	int i;
 
-	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return true;
 
 	for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -191,7 +190,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 			break;
 
 		p      = msrpm_offsets[i];
-		offset = svm->nested.vmcb_msrpm + (p * 4);
+		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
 
 		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
 			return false;
@@ -229,16 +228,10 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 static void load_nested_vmcb_control(struct vcpu_svm *svm,
 				     struct vmcb_control_area *control)
 {
-	svm->nested.nested_cr3 = control->nested_cr3;
+	copy_vmcb_control_area(&svm->nested.ctl, control);
 
-	svm->nested.vmcb_msrpm = control->msrpm_base_pa & ~0x0fffULL;
-	svm->nested.vmcb_iopm  = control->iopm_base_pa  & ~0x0fffULL;
-
-	/* cache intercepts */
-	svm->nested.intercept_cr         = control->intercept_cr;
-	svm->nested.intercept_dr         = control->intercept_dr;
-	svm->nested.intercept_exceptions = control->intercept_exceptions;
-	svm->nested.intercept            = control->intercept;
+	svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
+	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
 }
 
 static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
@@ -270,34 +263,32 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 }
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
+static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 {
-	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
+	if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
 		nested_svm_init_mmu_context(&svm->vcpu);
 
 	/* Guest paging mode is active - reset mmu */
 	kvm_mmu_reset_context(&svm->vcpu);
 
 	svm_flush_tlb(&svm->vcpu);
-	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+	if (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-		svm->vcpu.arch.l1_tsc_offset + nested_vmcb->control.tsc_offset;
+		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
 
-	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
-	svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
-	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
-	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
-	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+	svm->vmcb->control.int_ctl             = svm->nested.ctl.int_ctl | V_INTR_MASKING_MASK;
+	svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
+	svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
+	svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
+	svm->vmcb->control.event_inj           = svm->nested.ctl.event_inj;
+	svm->vmcb->control.event_inj_err       = svm->nested.ctl.event_inj_err;
 
-	svm->vmcb->control.pause_filter_count =
-		nested_vmcb->control.pause_filter_count;
-	svm->vmcb->control.pause_filter_thresh =
-		nested_vmcb->control.pause_filter_thresh;
+	svm->vmcb->control.pause_filter_count  = svm->nested.ctl.pause_filter_count;
+	svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
 
 	/* Enter Guest-Mode */
 	enter_guest_mode(&svm->vcpu);
@@ -326,7 +317,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 
 	load_nested_vmcb_control(svm, &nested_vmcb->control);
 	nested_prepare_vmcb_save(svm, nested_vmcb);
-	nested_prepare_vmcb_control(svm, nested_vmcb);
+	nested_prepare_vmcb_control(svm);
 
 	/*
 	 * If L1 had a pending IRQ/NMI before executing VMRUN,
@@ -556,7 +547,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
-	svm->nested.nested_cr3 = 0;
+	svm->nested.ctl.nested_cr3 = 0;
 
 	/* Restore selected save entries */
 	svm->vmcb->save.es = hsave->save.es;
@@ -606,7 +597,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	u32 offset, msr, value;
 	int write, mask;
 
-	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
 
 	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -620,7 +611,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	/* Offset is in 32 bit units but need in 8 bit units */
 	offset *= 4;
 
-	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
+	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
 		return NESTED_EXIT_DONE;
 
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -633,13 +624,13 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 	u8 start_bit;
 	u64 gpa;
 
-	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
 		return NESTED_EXIT_HOST;
 
 	port = svm->vmcb->control.exit_info_1 >> 16;
 	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
 		SVM_IOIO_SIZE_SHIFT;
-	gpa  = svm->nested.vmcb_iopm + (port / 8);
+	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
 	start_bit = port % 8;
 	iopm_len = (start_bit + size > 8) ? 2 : 1;
 	mask = (0xf >> (4 - size)) << start_bit;
@@ -665,13 +656,13 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 		break;
 	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
 		u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
-		if (svm->nested.intercept_cr & bit)
+		if (svm->nested.ctl.intercept_cr & bit)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
 	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
 		u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
-		if (svm->nested.intercept_dr & bit)
+		if (svm->nested.ctl.intercept_dr & bit)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
@@ -690,7 +681,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	}
 	default: {
 		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-		if (svm->nested.intercept & exit_bits)
+		if (svm->nested.ctl.intercept & exit_bits)
 			vmexit = NESTED_EXIT_DONE;
 	}
 	}
@@ -730,7 +721,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
 {
 	unsigned int nr = svm->vcpu.arch.exception.nr;
 
-	return (svm->nested.intercept_exceptions & (1 << nr));
+	return (svm->nested.ctl.intercept_exceptions & (1 << nr));
 }
 
 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
@@ -798,7 +789,7 @@ static void nested_svm_intr(struct vcpu_svm *svm)
 
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
-	return (svm->nested.intercept & (1ULL << INTERCEPT_INIT));
+	return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
 }
 
 static void nested_svm_init(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 76b3f553815e..ec98c5979656 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2173,7 +2173,7 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
 	bool ret = false;
 	u64 intercept;
 
-	intercept = svm->nested.intercept;
+	intercept = svm->nested.ctl.intercept;
 
 	if (!is_guest_mode(&svm->vcpu) ||
 	    (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
@@ -3649,7 +3649,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 		    info->intercept == x86_intercept_clts)
 			break;
 
-		intercept = svm->nested.intercept;
+		intercept = svm->nested.ctl.intercept;
 
 		if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
 			break;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 33e3f09d7a8e..dd5418f20256 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -91,22 +91,12 @@ struct nested_state {
 	/* These are the merged vectors */
 	u32 *msrpm;
 
-	/* gpa pointers to the real vectors */
-	u64 vmcb_msrpm;
-	u64 vmcb_iopm;
-
 	/* A VMRUN has started but has not yet been performed, so
 	 * we cannot inject a nested vmexit yet.  */
 	bool nested_run_pending;
 
-	/* cache for intercepts of the guest */
-	u32 intercept_cr;
-	u32 intercept_dr;
-	u32 intercept_exceptions;
-	u64 intercept;
-
-	/* Nested Paging related state */
-	u64 nested_cr3;
+	/* cache for control fields of the guest */
+	struct vmcb_control_area ctl;
 };
 
 struct vcpu_svm {
@@ -381,17 +371,17 @@ static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 
 static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
 {
-	return (svm->nested.intercept & (1ULL << INTERCEPT_SMI));
+	return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI));
 }
 
 static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
 {
-	return (svm->nested.intercept & (1ULL << INTERCEPT_INTR));
+	return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR));
 }
 
 static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 {
-	return (svm->nested.intercept & (1ULL << INTERCEPT_NMI));
+	return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
 }
 
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-- 
cgit v1.2.3


From d8e4e58f4bd4bb55d2640a841c3606333930f0e1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 07:38:20 -0400
Subject: KVM: nSVM: restore clobbered INT_CTL fields after clearing VINTR

Restore the INT_CTL value from the guest's VMCB once we've stopped using
it, so that virtual interrupts can be injected as requested by L1.
V_TPR is up-to-date however, and it can change if the guest writes to CR8,
so keep it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ec98c5979656..4122ba86bac2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1365,9 +1365,17 @@ static void svm_set_vintr(struct vcpu_svm *svm)
 
 static void svm_clear_vintr(struct vcpu_svm *svm)
 {
+	const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
 	clr_intercept(svm, INTERCEPT_VINTR);
 
-	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+	/* Drop int_ctl fields related to VINTR injection.  */
+	svm->vmcb->control.int_ctl &= mask;
+	if (is_guest_mode(&svm->vcpu)) {
+		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
+			(svm->nested.ctl.int_ctl & V_TPR_MASK));
+		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
+	}
+
 	mark_dirty(svm->vmcb, VMCB_INTR);
 }
 
-- 
cgit v1.2.3


From 2d8a42be0e2b15a4e0b20349f27bb8288db5ebe6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 03:50:14 -0400
Subject: KVM: nSVM: synchronize VMCB controls updated by the processor on
 every vmexit

The control state changes on every L2->L0 vmexit, and we will have to
serialize it in the nested state.  So keep it up to date in svm->nested.ctl
and just copy them back to the nested VMCB in nested_svm_vmexit.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 57 +++++++++++++++++++++++++++--------------------
 arch/x86/kvm/svm/svm.c    |  5 ++++-
 arch/x86/kvm/svm/svm.h    |  1 +
 3 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1e5f460b5540..921466eba556 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -234,6 +234,34 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
 }
 
+/*
+ * Synchronize fields that are written by the processor, so that
+ * they can be copied back into the nested_vmcb.
+ */
+void sync_nested_vmcb_control(struct vcpu_svm *svm)
+{
+	u32 mask;
+	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
+	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;
+
+	/* Only a few fields of int_ctl are written by the processor.  */
+	mask = V_IRQ_MASK | V_TPR_MASK;
+	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
+	    is_intercept(svm, SVM_EXIT_VINTR)) {
+		/*
+		 * In order to request an interrupt window, L0 is usurping
+		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
+		 * even if it was clear in L1's VMCB.  Restoring it would be
+		 * wrong.  However, in this case V_IRQ will remain true until
+		 * interrupt_window_interception calls svm_clear_vintr and
+		 * restores int_ctl.  We can just leave it aside.
+		 */
+		mask &= ~V_IRQ_MASK;
+	}
+	svm->nested.ctl.int_ctl        &= ~mask;
+	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
+}
+
 static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
 {
 	/* Load the nested guest state */
@@ -471,6 +499,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Exit Guest-Mode */
 	leave_guest_mode(&svm->vcpu);
 	svm->nested.vmcb = 0;
+	WARN_ON_ONCE(svm->nested.nested_run_pending);
 
 	/* in case we halted in L2 */
 	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -497,8 +526,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.dr6    = svm->vcpu.arch.dr6;
 	nested_vmcb->save.cpl    = vmcb->save.cpl;
 
-	nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
-	nested_vmcb->control.int_vector        = vmcb->control.int_vector;
 	nested_vmcb->control.int_state         = vmcb->control.int_state;
 	nested_vmcb->control.exit_code         = vmcb->control.exit_code;
 	nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
@@ -510,34 +537,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (svm->nrips_enabled)
 		nested_vmcb->control.next_rip  = vmcb->control.next_rip;
 
-	/*
-	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
-	 * to make sure that we do not lose injected events. So check event_inj
-	 * here and copy it to exit_int_info if it is valid.
-	 * Exit_int_info and event_inj can't be both valid because the case
-	 * below only happens on a VMRUN instruction intercept which has
-	 * no valid exit_int_info set.
-	 */
-	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
-		struct vmcb_control_area *nc = &nested_vmcb->control;
-
-		nc->exit_int_info     = vmcb->control.event_inj;
-		nc->exit_int_info_err = vmcb->control.event_inj_err;
-	}
-
-	nested_vmcb->control.tlb_ctl           = 0;
-	nested_vmcb->control.event_inj         = 0;
-	nested_vmcb->control.event_inj_err     = 0;
+	nested_vmcb->control.int_ctl           = svm->nested.ctl.int_ctl;
+	nested_vmcb->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
+	nested_vmcb->control.event_inj         = svm->nested.ctl.event_inj;
+	nested_vmcb->control.event_inj_err     = svm->nested.ctl.event_inj_err;
 
 	nested_vmcb->control.pause_filter_count =
 		svm->vmcb->control.pause_filter_count;
 	nested_vmcb->control.pause_filter_thresh =
 		svm->vmcb->control.pause_filter_thresh;
 
-	/* We always set V_INTR_MASKING and remember the old value in hflags */
-	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
-
 	/* Restore the original control entries */
 	copy_vmcb_control_area(&vmcb->control, &hsave->control);
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 4122ba86bac2..b710e62ace16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3427,7 +3427,10 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
-	svm->nested.nested_run_pending = 0;
+	if (is_guest_mode(&svm->vcpu)) {
+		sync_nested_vmcb_control(svm);
+		svm->nested.nested_run_pending = 0;
+	}
 
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index dd5418f20256..7e79f0af1204 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -394,6 +394,7 @@ int nested_svm_check_permissions(struct vcpu_svm *svm);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 			       bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
+void sync_nested_vmcb_control(struct vcpu_svm *svm);
 
 extern struct kvm_x86_nested_ops svm_nested_ops;
 
-- 
cgit v1.2.3


From 31031098feb9e1233176896d31eaf766c13429df Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 12:33:52 -0400
Subject: KVM: nSVM: remove unnecessary if

kvm_vcpu_apicv_active must be false when nested virtualization is enabled,
so there is no need to check it in clgi_interception.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/svm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b710e62ace16..7383f821eb3b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2015,8 +2015,7 @@ static int clgi_interception(struct vcpu_svm *svm)
 	disable_gif(svm);
 
 	/* After a CLGI no interrupts should come */
-	if (!kvm_vcpu_apicv_active(&svm->vcpu))
-		svm_clear_vintr(svm);
+	svm_clear_vintr(svm);
 
 	return ret;
 }
-- 
cgit v1.2.3


From ffdf7f9e80ac1b6fea3bc6a65ea1f264bc226eab Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 12:18:27 -0400
Subject: KVM: nSVM: extract svm_set_gif

Extract the code that is needed to implement CLGI and STGI,
so that we can run it from VMRUN and vmexit (and in the future,
KVM_SET_NESTED_STATE).  Skip the request for KVM_REQ_EVENT unless needed,
subsuming the evaluate_pending_interrupts optimization that is found
in enter_svm_guest_mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/irq.c        |  1 +
 arch/x86/kvm/svm/nested.c | 22 ++------------------
 arch/x86/kvm/svm/svm.c    | 51 +++++++++++++++++++++++++++++++----------------
 arch/x86/kvm/svm/svm.h    |  1 +
 4 files changed, 38 insertions(+), 37 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 54f7ea68083b..99d118ffc67d 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -83,6 +83,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 
 	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
 }
+EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr);
 
 /*
  * check if there is pending interrupt without
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 921466eba556..7e4a506828c9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -333,10 +333,6 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			  struct vmcb *nested_vmcb)
 {
-	bool evaluate_pending_interrupts =
-		is_intercept(svm, INTERCEPT_VINTR) ||
-		is_intercept(svm, INTERCEPT_IRET);
-
 	svm->nested.vmcb = vmcb_gpa;
 	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
 		svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -347,21 +343,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	nested_prepare_vmcb_save(svm, nested_vmcb);
 	nested_prepare_vmcb_control(svm);
 
-	/*
-	 * If L1 had a pending IRQ/NMI before executing VMRUN,
-	 * which wasn't delivered because it was disallowed (e.g.
-	 * interrupts disabled), L0 needs to evaluate if this pending
-	 * event should cause an exit from L2 to L1 or be delivered
-	 * directly to L2.
-	 *
-	 * Usually this would be handled by the processor noticing an
-	 * IRQ/NMI window request.  However, VMRUN can unblock interrupts
-	 * by implicitly setting GIF, so force L0 to perform pending event
-	 * evaluation by requesting a KVM_REQ_EVENT.
-	 */
-	enable_gif(svm);
-	if (unlikely(evaluate_pending_interrupts))
-		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+	svm_set_gif(svm, true);
 }
 
 int nested_svm_vmrun(struct vcpu_svm *svm)
@@ -505,7 +487,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 	/* Give the current vmcb to the guest */
-	disable_gif(svm);
+	svm_set_gif(svm, false);
 
 	nested_vmcb->save.es     = vmcb->save.es;
 	nested_vmcb->save.cs     = vmcb->save.cs;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7383f821eb3b..e48e4173bc60 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1981,6 +1981,38 @@ static int vmrun_interception(struct vcpu_svm *svm)
 	return nested_svm_vmrun(svm);
 }
 
+void svm_set_gif(struct vcpu_svm *svm, bool value)
+{
+	if (value) {
+		/*
+		 * If VGIF is enabled, the STGI intercept is only added to
+		 * detect the opening of the SMI/NMI window; remove it now.
+		 * Likewise, clear the VINTR intercept, we will set it
+		 * again while processing KVM_REQ_EVENT if needed.
+		 */
+		if (vgif_enabled(svm))
+			clr_intercept(svm, INTERCEPT_STGI);
+		if (is_intercept(svm, SVM_EXIT_VINTR))
+			svm_clear_vintr(svm);
+
+		enable_gif(svm);
+		if (svm->vcpu.arch.smi_pending ||
+		    svm->vcpu.arch.nmi_pending ||
+		    kvm_cpu_has_injectable_intr(&svm->vcpu))
+			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+	} else {
+		disable_gif(svm);
+
+		/*
+		 * After a CLGI no interrupts should come.  But if vGIF is
+		 * in use, we still rely on the VINTR intercept (rather than
+		 * STGI) to detect an open interrupt window.
+		*/
+		if (!vgif_enabled(svm))
+			svm_clear_vintr(svm);
+	}
+}
+
 static int stgi_interception(struct vcpu_svm *svm)
 {
 	int ret;
@@ -1988,18 +2020,8 @@ static int stgi_interception(struct vcpu_svm *svm)
 	if (nested_svm_check_permissions(svm))
 		return 1;
 
-	/*
-	 * If VGIF is enabled, the STGI intercept is only added to
-	 * detect the opening of the SMI/NMI window; remove it now.
-	 */
-	if (vgif_enabled(svm))
-		clr_intercept(svm, INTERCEPT_STGI);
-
 	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
-	enable_gif(svm);
-
+	svm_set_gif(svm, true);
 	return ret;
 }
 
@@ -2011,12 +2033,7 @@ static int clgi_interception(struct vcpu_svm *svm)
 		return 1;
 
 	ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-	disable_gif(svm);
-
-	/* After a CLGI no interrupts should come */
-	svm_clear_vintr(svm);
-
+	svm_set_gif(svm, false);
 	return ret;
 }
 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7e79f0af1204..10b7b55720a0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -357,6 +357,7 @@ void disable_nmi_singlestep(struct vcpu_svm *svm);
 bool svm_smi_blocked(struct kvm_vcpu *vcpu);
 bool svm_nmi_blocked(struct kvm_vcpu *vcpu);
 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu);
+void svm_set_gif(struct vcpu_svm *svm, bool value);
 
 /* nested.c */
 
-- 
cgit v1.2.3


From 91b7130cb6606d8c6b3b77e54426b3f3a83f48b1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 12:28:52 -0400
Subject: KVM: SVM: preserve VGIF across VMCB switch

There is only one GIF flag for the whole processor, so make sure it is not clobbered
when switching to L2 (in which case we also have to include the V_GIF_ENABLE_MASK,
lest we confuse enable_gif/disable_gif/gif_set).  When going back, L1 could in
theory have entered L2 without issuing a CLGI so make sure the svm_set_gif is
done last, after svm->vmcb->control.int_ctl has been copied back from hsave.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7e4a506828c9..6c7f0bffdf01 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -293,6 +293,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
 
 static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 {
+	const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 	if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
 		nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -308,7 +309,10 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
 		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
 
-	svm->vmcb->control.int_ctl             = svm->nested.ctl.int_ctl | V_INTR_MASKING_MASK;
+	svm->vmcb->control.int_ctl             =
+		(svm->nested.ctl.int_ctl & ~mask) |
+		(svm->nested.hsave->control.int_ctl & mask);
+
 	svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
 	svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
 	svm->vmcb->control.int_state           = svm->nested.ctl.int_state;
-- 
cgit v1.2.3


From 36e2e98363e6c13288de6824b51866292dbc151d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 22 May 2020 06:04:57 -0400
Subject: KVM: nSVM: synthesize correct EXITINTINFO on vmexit

This bit was added to nested VMX right when nested_run_pending was
introduced, but it is not yet there in nSVM.  Since we can have pending
events that L0 injected directly into L2 on vmentry, we have to transfer
them into L1's queue.

For this to work, one important change is required: svm_complete_interrupts
(which clears the "injected" fields from the previous VMRUN, and updates them
from svm->vmcb's EXITINTINFO) must be placed before we inject the vmexit.
This is not too scary though; VMX even does it in vmx_vcpu_run.

While at it, the nested_vmexit_inject tracepoint is moved towards the
end of nested_svm_vmexit.  This ensures that the synthesized EXITINTINFO
is visible in the trace.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 59 +++++++++++++++++++++++++++++++++++++----------
 arch/x86/kvm/svm/svm.c    |  4 ++--
 2 files changed, 49 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6c7f0bffdf01..c3c04fef11df 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -262,6 +262,43 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
 	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
 }
 
+/*
+ * Transfer any event that L0 or L1 wanted to inject into L2 to
+ * EXIT_INT_INFO.
+ */
+static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
+					   struct vmcb *nested_vmcb)
+{
+	struct kvm_vcpu *vcpu = &svm->vcpu;
+	u32 exit_int_info = 0;
+	unsigned int nr;
+
+	if (vcpu->arch.exception.injected) {
+		nr = vcpu->arch.exception.nr;
+		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+
+		if (vcpu->arch.exception.has_error_code) {
+			exit_int_info |= SVM_EVTINJ_VALID_ERR;
+			nested_vmcb->control.exit_int_info_err =
+				vcpu->arch.exception.error_code;
+		}
+
+	} else if (vcpu->arch.nmi_injected) {
+		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+	} else if (vcpu->arch.interrupt.injected) {
+		nr = vcpu->arch.interrupt.nr;
+		exit_int_info = nr | SVM_EVTINJ_VALID;
+
+		if (vcpu->arch.interrupt.soft)
+			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
+		else
+			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
+	}
+
+	nested_vmcb->control.exit_int_info = exit_int_info;
+}
+
 static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
 {
 	/* Load the nested guest state */
@@ -466,13 +503,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	struct vmcb *vmcb = svm->vmcb;
 	struct kvm_host_map map;
 
-	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
-				       vmcb->control.exit_info_1,
-				       vmcb->control.exit_info_2,
-				       vmcb->control.exit_int_info,
-				       vmcb->control.exit_int_info_err,
-				       KVM_ISA_SVM);
-
 	rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
 	if (rc) {
 		if (rc == -EINVAL)
@@ -517,8 +547,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
 	nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
 	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
-	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
-	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+	if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
+		nested_vmcb_save_pending_event(svm, nested_vmcb);
 
 	if (svm->nrips_enabled)
 		nested_vmcb->control.next_rip  = vmcb->control.next_rip;
@@ -539,9 +570,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
 		svm->vcpu.arch.l1_tsc_offset;
 
-	kvm_clear_exception_queue(&svm->vcpu);
-	kvm_clear_interrupt_queue(&svm->vcpu);
-
 	svm->nested.ctl.nested_cr3 = 0;
 
 	/* Restore selected save entries */
@@ -570,6 +598,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	mark_all_dirty(svm->vmcb);
 
+	trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
+				       nested_vmcb->control.exit_info_1,
+				       nested_vmcb->control.exit_info_2,
+				       nested_vmcb->control.exit_int_info,
+				       nested_vmcb->control.exit_int_info_err,
+				       KVM_ISA_SVM);
+
 	kvm_vcpu_unmap(&svm->vcpu, &map, true);
 
 	nested_svm_uninit_mmu_context(&svm->vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e48e4173bc60..422b1cc62a20 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2913,6 +2913,8 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	if (npt_enabled)
 		vcpu->arch.cr3 = svm->vmcb->save.cr3;
 
+	svm_complete_interrupts(svm);
+
 	if (is_guest_mode(vcpu)) {
 		int vmexit;
 
@@ -2932,8 +2934,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 			return 1;
 	}
 
-	svm_complete_interrupts(svm);
-
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
-- 
cgit v1.2.3


From e9fd761a46b8655aa5ff84b4adc0c8cf43952145 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 May 2020 13:28:23 -0400
Subject: KVM: nSVM: remove HF_VINTR_MASK

Now that the int_ctl field is stored in svm->nested.ctl.int_ctl, we can
use it instead of vcpu->arch.hflags to check whether L2 is running
in V_INTR_MASKING mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/svm/nested.c       | 6 +-----
 arch/x86/kvm/svm/svm.c          | 2 +-
 arch/x86/kvm/svm/svm.h          | 4 +++-
 4 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e6f2e1a2dab6..0dfc522f96cc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1596,7 +1596,6 @@ enum {
 
 #define HF_GIF_MASK		(1 << 0)
 #define HF_HIF_MASK		(1 << 1)
-#define HF_VINTR_MASK		(1 << 2)
 #define HF_NMI_MASK		(1 << 3)
 #define HF_IRET_MASK		(1 << 4)
 #define HF_GUEST_MASK		(1 << 5) /* VCPU is in guest-mode */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c3c04fef11df..6967fe884eaf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -118,7 +118,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
 	c->intercept_exceptions = h->intercept_exceptions;
 	c->intercept = h->intercept;
 
-	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+	if (g->int_ctl & V_INTR_MASKING_MASK) {
 		/* We only want the cr8 intercept bits of L1 */
 		c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
 		c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
@@ -338,10 +338,6 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 	kvm_mmu_reset_context(&svm->vcpu);
 
 	svm_flush_tlb(&svm->vcpu);
-	if (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
-	else
-		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
 	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
 		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 422b1cc62a20..6867dba4f736 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3104,7 +3104,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 
 	if (is_guest_mode(vcpu)) {
 		/* As long as interrupts are being delivered...  */
-		if ((svm->vcpu.arch.hflags & HF_VINTR_MASK)
+		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
 		    ? !(svm->vcpu.arch.hflags & HF_HIF_MASK)
 		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
 			return true;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 10b7b55720a0..be8e830f83fa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -367,7 +367,9 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
 
 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
 {
-	return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK);
 }
 
 static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
-- 
cgit v1.2.3


From 08245e6d2e589f2b6e9e275ddb343e2ec9ce94ec Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2020 09:21:04 -0400
Subject: KVM: nSVM: remove HF_HIF_MASK

The L1 flags can be found in the save area of svm->nested.hsave, fish
it from there so that there is one fewer thing to migrate.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/svm/nested.c       | 5 -----
 arch/x86/kvm/svm/svm.c          | 2 +-
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0dfc522f96cc..3485f8454088 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1595,7 +1595,6 @@ enum {
 };
 
 #define HF_GIF_MASK		(1 << 0)
-#define HF_HIF_MASK		(1 << 1)
 #define HF_NMI_MASK		(1 << 3)
 #define HF_IRET_MASK		(1 << 4)
 #define HF_GUEST_MASK		(1 << 5) /* VCPU is in guest-mode */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6967fe884eaf..65ecc8586f75 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -371,11 +371,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			  struct vmcb *nested_vmcb)
 {
 	svm->nested.vmcb = vmcb_gpa;
-	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
-		svm->vcpu.arch.hflags |= HF_HIF_MASK;
-	else
-		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
 	load_nested_vmcb_control(svm, &nested_vmcb->control);
 	nested_prepare_vmcb_save(svm, nested_vmcb);
 	nested_prepare_vmcb_control(svm);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6867dba4f736..bc08221f6743 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3105,7 +3105,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
 	if (is_guest_mode(vcpu)) {
 		/* As long as interrupts are being delivered...  */
 		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
-		    ? !(svm->vcpu.arch.hflags & HF_HIF_MASK)
+		    ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF)
 		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
 			return true;
 
-- 
cgit v1.2.3


From ca46d739e3caf44dcd3db9eb8da30d0ff3aa9180 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 13:02:15 -0400
Subject: KVM: nSVM: split nested_vmcb_check_controls

The authoritative state does not come from the VMCB once in guest mode,
but KVM_SET_NESTED_STATE can still perform checks on L1's provided SVM
controls because we get them from userspace.

Therefore, split out a function to do them.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 65ecc8586f75..bd3a89cd4070 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -203,26 +203,31 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	return true;
 }
 
-static bool nested_vmcb_checks(struct vmcb *vmcb)
+static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
 {
-	if ((vmcb->save.efer & EFER_SVME) == 0)
+	if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
 		return false;
 
-	if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
-	    (vmcb->save.cr0 & X86_CR0_NW))
+	if (control->asid == 0)
 		return false;
 
-	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+	if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+	    !npt_enabled)
 		return false;
 
-	if (vmcb->control.asid == 0)
+	return true;
+}
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+	if ((vmcb->save.efer & EFER_SVME) == 0)
 		return false;
 
-	if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-	    !npt_enabled)
+	if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
+	    (vmcb->save.cr0 & X86_CR0_NW))
 		return false;
 
-	return true;
+	return nested_vmcb_check_controls(&vmcb->control);
 }
 
 static void load_nested_vmcb_control(struct vcpu_svm *svm,
-- 
cgit v1.2.3


From c513f484c5582a8efadf3d72298e2285b041536e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2020 13:08:37 -0400
Subject: KVM: nSVM: leave guest mode when clearing EFER.SVME

According to the AMD manual, the effect of turning off EFER.SVME while a
guest is running is undefined.  We make it leave guest mode immediately,
similar to the effect of clearing the VMX bit in MSR_IA32_FEAT_CTL.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/nested.c | 16 ++++++++++++++++
 arch/x86/kvm/svm/svm.c    | 10 ++++++++--
 arch/x86/kvm/svm/svm.h    |  1 +
 3 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bd3a89cd4070..369eca73fe3e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -618,6 +618,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	return 0;
 }
 
+/*
+ * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+ */
+void svm_leave_nested(struct vcpu_svm *svm)
+{
+	if (is_guest_mode(&svm->vcpu)) {
+		struct vmcb *hsave = svm->nested.hsave;
+		struct vmcb *vmcb = svm->vmcb;
+
+		svm->nested.nested_run_pending = 0;
+		leave_guest_mode(&svm->vcpu);
+		copy_vmcb_control_area(&vmcb->control, &hsave->control);
+		nested_svm_uninit_mmu_context(&svm->vcpu);
+	}
+}
+
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 offset, msr, value;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index bc08221f6743..b4db9a980469 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -265,6 +265,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
 
 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
 	vcpu->arch.efer = efer;
 
 	if (!npt_enabled) {
@@ -275,8 +276,13 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 			efer &= ~EFER_LME;
 	}
 
-	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
-	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+	if (!(efer & EFER_SVME)) {
+		svm_leave_nested(svm);
+		svm_set_gif(svm, true);
+	}
+
+	svm->vmcb->save.efer = efer | EFER_SVME;
+	mark_dirty(svm->vmcb, VMCB_CR);
 }
 
 static int is_external_interrupt(u32 info)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index be8e830f83fa..6ac4c00a5d82 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -389,6 +389,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
 
 void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 			  struct vmcb *nested_vmcb);
+void svm_leave_nested(struct vcpu_svm *svm);
 int nested_svm_vmrun(struct vcpu_svm *svm);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
-- 
cgit v1.2.3


From 929d1cfaa6926ccee28d8d4220e0b4e2defd9cd1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2020 06:18:31 -0400
Subject: KVM: MMU: pass arbitrary CR0/CR4/EFER to kvm_init_shadow_mmu

This allows fetching the registers from the hsave area when setting
up the NPT shadow MMU, and is needed for KVM_SET_NESTED_STATE (which
runs long after the CR0, CR4 and EFER values in vcpu have been switched
to hold L2 guest state).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.h        |  2 +-
 arch/x86/kvm/mmu/mmu.c    | 14 +++++++++-----
 arch/x86/kvm/svm/nested.c |  5 ++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 048e865ad485..0ad06bfe2c2c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,7 +57,7 @@ void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 			     bool accessed_dirty, gpa_t new_eptp);
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fd1c9145505c..2e62a03410c7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4952,7 +4952,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	return role;
 }
 
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
 {
 	struct kvm_mmu *context = vcpu->arch.mmu;
 	union kvm_mmu_role new_role =
@@ -4961,11 +4961,11 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 	if (new_role.as_u64 == context->mmu_role.as_u64)
 		return;
 
-	if (!is_paging(vcpu))
+	if (!(cr0 & X86_CR0_PG))
 		nonpaging_init_context(vcpu, context);
-	else if (is_long_mode(vcpu))
+	else if (efer & EFER_LMA)
 		paging64_init_context(vcpu, context);
-	else if (is_pae(vcpu))
+	else if (cr4 & X86_CR4_PAE)
 		paging32E_init_context(vcpu, context);
 	else
 		paging32_init_context(vcpu, context);
@@ -5043,7 +5043,11 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = vcpu->arch.mmu;
 
-	kvm_init_shadow_mmu(vcpu);
+	kvm_init_shadow_mmu(vcpu,
+			    kvm_read_cr0_bits(vcpu, X86_CR0_PG),
+			    kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
+			    vcpu->arch.efer);
+
 	context->get_guest_pgd     = get_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 369eca73fe3e..c712fe577029 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -80,10 +80,13 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *hsave = svm->nested.hsave;
+
 	WARN_ON(mmu_is_nested(vcpu));
 
 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-	kvm_init_shadow_mmu(vcpu);
+	kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer);
 	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
 	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
 	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-- 
cgit v1.2.3


From cc440cdad5b7a4c1de12dace725209eb3e0cf663 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 May 2020 13:36:32 -0400
Subject: KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE

Similar to VMX, the state that is captured through the currently available
IOCTLs is a mix of L1 and L2 state, dependent on whether the L2 guest was
running at the moment when the process was interrupted to save its state.

In particular, the SVM-specific state for nested virtualization includes
the L1 saved state (including the interrupt flag), the cached L2 controls,
and the GIF.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/uapi/asm/kvm.h |  17 ++++-
 arch/x86/kvm/cpuid.h            |   5 ++
 arch/x86/kvm/svm/nested.c       | 147 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c          |   1 +
 arch/x86/kvm/vmx/nested.c       |   5 --
 arch/x86/kvm/x86.c              |   3 +-
 6 files changed, 171 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 3f3f780c8c65..12075a9de1c1 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -385,18 +385,22 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
 
 #define KVM_STATE_NESTED_FORMAT_VMX	0
-#define KVM_STATE_NESTED_FORMAT_SVM	1	/* unused */
+#define KVM_STATE_NESTED_FORMAT_SVM	1
 
 #define KVM_STATE_NESTED_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING	0x00000002
 #define KVM_STATE_NESTED_EVMCS		0x00000004
 #define KVM_STATE_NESTED_MTF_PENDING	0x00000008
+#define KVM_STATE_NESTED_GIF_SET	0x00000100
 
 #define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_SMM_VMXON	0x00000002
 
 #define KVM_STATE_NESTED_VMX_VMCS_SIZE	0x1000
 
+#define KVM_STATE_NESTED_SVM_VMCB_SIZE	0x1000
+
+
 struct kvm_vmx_nested_state_data {
 	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
 	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -411,6 +415,15 @@ struct kvm_vmx_nested_state_hdr {
 	} smm;
 };
 
+struct kvm_svm_nested_state_data {
+	/* Save area only used if KVM_STATE_NESTED_RUN_PENDING.  */
+	__u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE];
+};
+
+struct kvm_svm_nested_state_hdr {
+	__u64 vmcb_pa;
+};
+
 /* for KVM_CAP_NESTED_STATE */
 struct kvm_nested_state {
 	__u16 flags;
@@ -419,6 +432,7 @@ struct kvm_nested_state {
 
 	union {
 		struct kvm_vmx_nested_state_hdr vmx;
+		struct kvm_svm_nested_state_hdr svm;
 
 		/* Pad the header to 128 bytes.  */
 		__u8 pad[120];
@@ -431,6 +445,7 @@ struct kvm_nested_state {
 	 */
 	union {
 		struct kvm_vmx_nested_state_data vmx[0];
+		struct kvm_svm_nested_state_data svm[0];
 	} data;
 };
 
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 63a70f6a3df3..05434cd9342f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -303,4 +303,9 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
 		kvm_cpu_cap_set(x86_feature);
 }
 
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
 #endif
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c712fe577029..6b1049148c1b 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -25,6 +25,7 @@
 #include "trace.h"
 #include "mmu.h"
 #include "x86.h"
+#include "cpuid.h"
 #include "lapic.h"
 #include "svm.h"
 
@@ -238,6 +239,8 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 {
 	copy_vmcb_control_area(&svm->nested.ctl, control);
 
+	/* Copy it here because nested_svm_check_controls will check it.  */
+	svm->nested.ctl.asid           = control->asid;
 	svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
 	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
 }
@@ -930,6 +933,150 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 	return NESTED_EXIT_CONTINUE;
 }
 
+static int svm_get_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				u32 user_data_size)
+{
+	struct vcpu_svm *svm;
+	struct kvm_nested_state kvm_state = {
+		.flags = 0,
+		.format = KVM_STATE_NESTED_FORMAT_SVM,
+		.size = sizeof(kvm_state),
+	};
+	struct vmcb __user *user_vmcb = (struct vmcb __user *)
+		&user_kvm_nested_state->data.svm[0];
+
+	if (!vcpu)
+		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
+
+	svm = to_svm(vcpu);
+
+	if (user_data_size < kvm_state.size)
+		goto out;
+
+	/* First fill in the header and copy it out.  */
+	if (is_guest_mode(vcpu)) {
+		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
+		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
+		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+		if (svm->nested.nested_run_pending)
+			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+	}
+
+	if (gif_set(svm))
+		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
+
+	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+		return -EFAULT;
+
+	if (!is_guest_mode(vcpu))
+		goto out;
+
+	/*
+	 * Copy over the full size of the VMCB rather than just the size
+	 * of the structs.
+	 */
+	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
+		return -EFAULT;
+	if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
+			 sizeof(user_vmcb->control)))
+		return -EFAULT;
+	if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+			 sizeof(user_vmcb->save)))
+		return -EFAULT;
+
+out:
+	return kvm_state.size;
+}
+
+static int svm_set_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				struct kvm_nested_state *kvm_state)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *hsave = svm->nested.hsave;
+	struct vmcb __user *user_vmcb = (struct vmcb __user *)
+		&user_kvm_nested_state->data.svm[0];
+	struct vmcb_control_area ctl;
+	struct vmcb_save_area save;
+	u32 cr0;
+
+	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
+		return -EINVAL;
+
+	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
+				 KVM_STATE_NESTED_RUN_PENDING |
+				 KVM_STATE_NESTED_GIF_SET))
+		return -EINVAL;
+
+	/*
+	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
+	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+	 */
+	if (!(vcpu->arch.efer & EFER_SVME)) {
+		/* GIF=1 and no guest mode are required if SVME=0.  */
+		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+			return -EINVAL;
+	}
+
+	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
+	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+		return -EINVAL;
+
+	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
+		svm_leave_nested(svm);
+		goto out_set_gif;
+	}
+
+	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
+		return -EINVAL;
+	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
+		return -EINVAL;
+	if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
+		return -EFAULT;
+	if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
+		return -EFAULT;
+
+	if (!nested_vmcb_check_controls(&ctl))
+		return -EINVAL;
+
+	/*
+	 * Processor state contains L2 state.  Check that it is
+	 * valid for guest mode (see nested_vmcb_checks).
+	 */
+	cr0 = kvm_read_cr0(vcpu);
+        if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
+                return -EINVAL;
+
+	/*
+	 * Validate host state saved from before VMRUN (see
+	 * nested_svm_check_permissions).
+	 * TODO: validate reserved bits for all saved state.
+	 */
+	if (!(save.cr0 & X86_CR0_PG))
+		return -EINVAL;
+
+	/*
+	 * All checks done, we can enter guest mode.  L1 control fields
+	 * come from the nested save state.  Guest state is already
+	 * in the registers, the save area of the nested state instead
+	 * contains saved L1 state.
+	 */
+	copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
+	hsave->save = save;
+
+	svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
+	load_nested_vmcb_control(svm, &ctl);
+	nested_prepare_vmcb_control(svm);
+
+out_set_gif:
+	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+	return 0;
+}
+
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.check_events = svm_check_nested_events,
+	.get_state = svm_get_nested_state,
+	.set_state = svm_set_nested_state,
 };
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b4db9a980469..3871bfb40594 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1193,6 +1193,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 		svm->avic_is_running = true;
 
 	svm->nested.hsave = page_address(hsave_page);
+	clear_page(svm->nested.hsave);
 
 	svm->msrpm = page_address(msrpm_pages);
 	svm_vcpu_init_msrpm(svm->msrpm);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 51ebb60e1533..106fc6fceb97 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -437,11 +437,6 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 	}
 }
 
-static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
-	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
 					       struct vmcs12 *vmcs12)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0fa610bed91..d4aa7dc662d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4628,7 +4628,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 
 		if (kvm_state.flags &
 		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
-		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING))
+		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+		      | KVM_STATE_NESTED_GIF_SET))
 			break;
 
 		/* nested_run_pending implies guest_mode.  */
-- 
cgit v1.2.3


From f4a9fdd5f12ad48700ad5dcb7a0dc7cf9a3bcd5a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 7 May 2020 13:56:18 -0500
Subject: KVM: VMX: Replace zero-length array with flexible-array

The current codebase makes use of the zero-length array language
extension to the C90 standard, but the preferred mechanism to declare
variable-length types such as these ones is a flexible array member[1][2],
introduced in C99:

struct foo {
        int stuff;
        struct boo array[];
};

By making use of the mechanism above, we will get a compiler warning
in case the flexible array does not occur last in the structure, which
will help us prevent some kind of undefined behavior bugs from being
inadvertently introduced[3] to the codebase from now on.

Also, notice that, dynamic memory allocations won't be affected by
this change:

"Flexible array members have incomplete type, and so the sizeof operator
may not be applied. As a quirk of the original implementation of
zero-length arrays, sizeof evaluates to zero."[1]

sizeof(flexible-array-member) triggers a warning because flexible array
members have incomplete type[1]. There are some instances of code in
which the sizeof operator is being incorrectly/erroneously applied to
zero-length arrays and the result is zero. Such instances may be hiding
some bugs. So, this work (flexible-array member conversions) will also
help to get completely rid of those sorts of issues.

This issue was found with the help of Coccinelle.

[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://github.com/KSPP/linux/issues/21
[3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour")

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Message-Id: <20200507185618.GA14831@embeddedor>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmcs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 481ad879197b..5c0ff80b85c0 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -19,7 +19,7 @@ struct vmcs_hdr {
 struct vmcs {
 	struct vmcs_hdr hdr;
 	u32 abort;
-	char data[0];
+	char data[];
 };
 
 DECLARE_PER_CPU(struct vmcs *, current_vmcs);
-- 
cgit v1.2.3


From 84b09f33a5de528d05c007d9847403a364dfe35e Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:16 +0200
Subject: Revert "KVM: async_pf: Fix #DF due to inject "Page not Present" and
 "Page Ready" exceptions simultaneously"

Commit 9a6e7c39810e (""KVM: async_pf: Fix #DF due to inject "Page not
Present" and "Page Ready" exceptions simultaneously") added a protection
against 'page ready' notification coming before 'page not present' is
delivered. This situation seems to be impossible since commit 2a266f23550b
("KVM MMU: check pending exception before injecting APF) which added
'vcpu->arch.exception.pending' check to kvm_can_do_async_pf.

On x86, kvm_arch_async_page_present() has only one call site:
kvm_check_async_pf_completion() loop and we only enter the loop when
kvm_arch_can_inject_async_page_present(vcpu) which when async pf msr
is enabled, translates into kvm_can_do_async_pf().

There is also one problem with the cancellation mechanism. We don't seem
to check that the 'page not present' notification we're canceling matches
the 'page ready' notification so in theory, we may erroneously drop two
valid events.

Revert the commit.

Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-2-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d4aa7dc662d5..b9a5ff7b922c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10427,13 +10427,6 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
 				      sizeof(val));
 }
 
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
-{
-
-	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
-				      sizeof(u32));
-}
-
 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
@@ -10498,7 +10491,6 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work)
 {
 	struct x86_exception fault;
-	u32 val;
 
 	if (work->wakeup_all)
 		work->arch.token = ~0; /* broadcast wakeup */
@@ -10507,19 +10499,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
 
 	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
-	    !apf_get_user(vcpu, &val)) {
-		if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
-		    vcpu->arch.exception.pending &&
-		    vcpu->arch.exception.nr == PF_VECTOR &&
-		    !apf_put_user(vcpu, 0)) {
-			vcpu->arch.exception.injected = false;
-			vcpu->arch.exception.pending = false;
-			vcpu->arch.exception.nr = 0;
-			vcpu->arch.exception.has_error_code = false;
-			vcpu->arch.exception.error_code = 0;
-			vcpu->arch.exception.has_payload = false;
-			vcpu->arch.exception.payload = 0;
-		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
 			fault.vector = PF_VECTOR;
 			fault.error_code_valid = true;
 			fault.error_code = 0;
@@ -10527,7 +10507,6 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 			fault.address = work->arch.token;
 			fault.async_page_fault = true;
 			kvm_inject_page_fault(vcpu, &fault);
-		}
 	}
 	vcpu->arch.apf.halted = false;
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-- 
cgit v1.2.3


From 68fd66f100d196d35ab3008d4c69af3a0d7e7200 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:17 +0200
Subject: KVM: x86: extend struct kvm_vcpu_pv_apf_data with token info

Currently, APF mechanism relies on the #PF abuse where the token is being
passed through CR2. If we switch to using interrupts to deliver page-ready
notifications we need a different way to pass the data. Extent the existing
'struct kvm_vcpu_pv_apf_data' with token information for page-ready
notifications.

While on it, rename 'reason' to 'flags'. This doesn't change the semantics
as we only have reasons '1' and '2' and these can be treated as bit flags
but KVM_PV_REASON_PAGE_READY is going away with interrupt based delivery
making 'reason' name misleading.

The newly introduced apf_put_user_ready() temporary puts both flags and
token information, this will be changed to put token only when we switch
to interrupt based notifications.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-3-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h      |  2 +-
 arch/x86/include/asm/kvm_para.h      |  4 ++--
 arch/x86/include/uapi/asm/kvm_para.h |  5 +++--
 arch/x86/kernel/kvm.c                | 16 ++++++++--------
 arch/x86/kvm/mmu/mmu.c               |  6 +++---
 arch/x86/kvm/svm/nested.c            |  2 +-
 arch/x86/kvm/svm/svm.c               |  3 ++-
 arch/x86/kvm/vmx/nested.c            |  2 +-
 arch/x86/kvm/vmx/vmx.c               |  4 ++--
 arch/x86/kvm/x86.c                   | 17 +++++++++++++----
 10 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3485f8454088..033f6173a857 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -770,7 +770,7 @@ struct kvm_vcpu_arch {
 		u64 msr_val;
 		u32 id;
 		bool send_user_only;
-		u32 host_apf_reason;
+		u32 host_apf_flags;
 		unsigned long nested_apf_token;
 		bool delivery_as_pf_vmexit;
 	} apf;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 118e5c2379f9..57fd1966c4ea 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -90,7 +90,7 @@ unsigned int kvm_arch_para_features(void);
 unsigned int kvm_arch_para_hints(void);
 void kvm_async_pf_task_wait_schedule(u32 token);
 void kvm_async_pf_task_wake(u32 token);
-u32 kvm_read_and_reset_pf_reason(void);
+u32 kvm_read_and_reset_apf_flags(void);
 void kvm_disable_steal_time(void);
 bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
 
@@ -131,7 +131,7 @@ static inline unsigned int kvm_arch_para_hints(void)
 	return 0;
 }
 
-static inline u32 kvm_read_and_reset_pf_reason(void)
+static inline u32 kvm_read_and_reset_apf_flags(void)
 {
 	return 0;
 }
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 2a8e0b6b9805..d1cd5c0f431a 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -112,8 +112,9 @@ struct kvm_mmu_op_release_pt {
 #define KVM_PV_REASON_PAGE_READY 2
 
 struct kvm_vcpu_pv_apf_data {
-	__u32 reason;
-	__u8 pad[60];
+	__u32 flags;
+	__u32 token; /* Used for page ready notification only */
+	__u8 pad[56];
 	__u32 enabled;
 };
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b3d9b0d7a37d..d6f22a3a1f7d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -218,23 +218,23 @@ again:
 }
 EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
 
-u32 kvm_read_and_reset_pf_reason(void)
+u32 kvm_read_and_reset_apf_flags(void)
 {
-	u32 reason = 0;
+	u32 flags = 0;
 
 	if (__this_cpu_read(apf_reason.enabled)) {
-		reason = __this_cpu_read(apf_reason.reason);
-		__this_cpu_write(apf_reason.reason, 0);
+		flags = __this_cpu_read(apf_reason.flags);
+		__this_cpu_write(apf_reason.flags, 0);
 	}
 
-	return reason;
+	return flags;
 }
-EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
-NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
+NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags);
 
 bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
-	u32 reason = kvm_read_and_reset_pf_reason();
+	u32 reason = kvm_read_and_reset_apf_flags();
 
 	switch (reason) {
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2e62a03410c7..5de1929cfc55 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4164,7 +4164,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 #endif
 
 	vcpu->arch.l1tf_flush_l1d = true;
-	switch (vcpu->arch.apf.host_apf_reason) {
+	switch (vcpu->arch.apf.host_apf_flags) {
 	default:
 		trace_kvm_page_fault(fault_address, error_code);
 
@@ -4174,13 +4174,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 				insn_len);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
-		vcpu->arch.apf.host_apf_reason = 0;
+		vcpu->arch.apf.host_apf_flags = 0;
 		local_irq_disable();
 		kvm_async_pf_task_wait_schedule(fault_address);
 		local_irq_enable();
 		break;
 	case KVM_PV_REASON_PAGE_READY:
-		vcpu->arch.apf.host_apf_reason = 0;
+		vcpu->arch.apf.host_apf_flags = 0;
 		local_irq_disable();
 		kvm_async_pf_task_wake(fault_address);
 		local_irq_enable();
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6b1049148c1b..8a6db11dcb43 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -921,7 +921,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
 		if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
 			return NESTED_EXIT_HOST;
 		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-			 svm->vcpu.arch.apf.host_apf_reason)
+			 svm->vcpu.arch.apf.host_apf_flags)
 			/* Trap async PF even if not shadowing */
 			return NESTED_EXIT_HOST;
 		break;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3871bfb40594..9e333b91ff78 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3459,7 +3459,8 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	/* if exit due to PF check for async PF */
 	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-		svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+		svm->vcpu.arch.apf.host_apf_flags =
+			kvm_read_and_reset_apf_flags();
 
 	if (npt_enabled) {
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 106fc6fceb97..119a2f7395d6 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5652,7 +5652,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
 		if (is_nmi(intr_info))
 			return true;
 		else if (is_page_fault(intr_info))
-			return vcpu->arch.apf.host_apf_reason || !enable_ept;
+			return vcpu->arch.apf.host_apf_flags || !enable_ept;
 		else if (is_debug(intr_info) &&
 			 vcpu->guest_debug &
 			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7b55dc6230a9..5a43af0061ef 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4765,7 +4765,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	if (is_page_fault(intr_info)) {
 		cr2 = vmx_get_exit_qual(vcpu);
 		/* EPT won't cause page fault directly */
-		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
+		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
 		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
 	}
 
@@ -6360,7 +6360,7 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
 
 	/* if exit due to PF check for async PF */
 	if (is_page_fault(intr_info)) {
-		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
 	/* Handle machine checks before interrupts are enabled */
 	} else if (is_machine_check(intr_info)) {
 		kvm_machine_check();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b9a5ff7b922c..84aa3c1519ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2690,7 +2690,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 	}
 
 	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
-					sizeof(u32)))
+					sizeof(u64)))
 		return 1;
 
 	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
@@ -10420,8 +10420,17 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 	}
 }
 
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
 {
+	u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
+
+	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
+				      sizeof(reason));
+}
+
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
+{
+	u64 val = (u64)token << 32 | KVM_PV_REASON_PAGE_READY;
 
 	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
 				      sizeof(val));
@@ -10466,7 +10475,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
 
 	if (kvm_can_deliver_async_pf(vcpu) &&
-	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+	    !apf_put_user_notpresent(vcpu)) {
 		fault.vector = PF_VECTOR;
 		fault.error_code_valid = true;
 		fault.error_code = 0;
@@ -10499,7 +10508,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
 
 	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
-	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+	    !apf_put_user_ready(vcpu, work->arch.token)) {
 			fault.vector = PF_VECTOR;
 			fault.error_code_valid = true;
 			fault.error_code = 0;
-- 
cgit v1.2.3


From 7c0ade6c9023b2b90b757e2927b306bec1cc4ca6 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:18 +0200
Subject: KVM: rename kvm_arch_can_inject_async_page_present() to
 kvm_arch_can_dequeue_async_page_present()

An innocent reader of the following x86 KVM code:

bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
{
        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
                return true;
...

may get very confused: if APF mechanism is not enabled, why do we report
that we 'can inject async page present'? In reality, upon injection
kvm_arch_async_page_present() will check the same condition again and,
in case APF is disabled, will just drop the item. This is fine as the
guest which deliberately disabled APF doesn't expect to get any APF
notifications.

Rename kvm_arch_can_inject_async_page_present() to
kvm_arch_can_dequeue_async_page_present() to make it clear what we are
checking: if the item can be dequeued (meaning either injected or just
dropped).

On s390 kvm_arch_can_inject_async_page_present() always returns 'true' so
the rename doesn't matter much.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-4-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/include/asm/kvm_host.h | 2 +-
 arch/s390/kvm/kvm-s390.c         | 2 +-
 arch/x86/include/asm/kvm_host.h  | 2 +-
 arch/x86/kvm/x86.c               | 2 +-
 virt/kvm/async_pf.c              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 176f74cc2257..9d23bde1363b 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -973,7 +973,7 @@ struct kvm_arch_async_pf {
 	unsigned long pfault_token;
 };
 
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
 
 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 			       struct kvm_async_pf *work);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index a560a368f92c..06bde4bad205 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3943,7 +3943,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 	/* s390 will always inject the page directly */
 }
 
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 {
 	/*
 	 * s390 will always inject the page directly,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 033f6173a857..f3897e417b69 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1660,7 +1660,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 			       struct kvm_async_pf *work);
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84aa3c1519ed..0e79b37b2b7e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10521,7 +10521,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 {
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
 		return true;
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 10b533f641a6..82e53f180a1a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -134,7 +134,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 	struct kvm_async_pf *work;
 
 	while (!list_empty_careful(&vcpu->async_pf.done) &&
-	      kvm_arch_can_inject_async_page_present(vcpu)) {
+	      kvm_arch_can_dequeue_async_page_present(vcpu)) {
 		spin_lock(&vcpu->async_pf.lock);
 		work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
 					      link);
-- 
cgit v1.2.3


From 2635b5c4a0e407b84f68e188c719f28ba0e9ae1b Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:20 +0200
Subject: KVM: x86: interrupt based APF 'page ready' event delivery

Concerns were expressed around APF delivery via synthetic #PF exception as
in some cases such delivery may collide with real page fault. For 'page
ready' notifications we can easily switch to using an interrupt instead.
Introduce new MSR_KVM_ASYNC_PF_INT mechanism and deprecate the legacy one.

One notable difference between the two mechanisms is that interrupt may not
get handled immediately so whenever we would like to deliver next event
(regardless of its type) we must be sure the guest had read and cleared
previous event in the slot.

While on it, get rid on 'type 1/type 2' names for APF events in the
documentation as they are causing confusion. Use 'page not present'
and 'page ready' everywhere instead.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-6-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/msr.rst       | 102 ++++++++++++++++++++++++-----------
 arch/x86/include/asm/kvm_host.h      |   4 +-
 arch/x86/include/uapi/asm/kvm_para.h |  12 ++++-
 arch/x86/kvm/x86.c                   |  93 +++++++++++++++++++++++---------
 4 files changed, 152 insertions(+), 59 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst
index 33892036672d..be08df12f31a 100644
--- a/Documentation/virt/kvm/msr.rst
+++ b/Documentation/virt/kvm/msr.rst
@@ -190,41 +190,68 @@ MSR_KVM_ASYNC_PF_EN:
 	0x4b564d02
 
 data:
-	Bits 63-6 hold 64-byte aligned physical address of a
-	64 byte memory area which must be in guest RAM and must be
-	zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1
-	when asynchronous page faults are enabled on the vcpu 0 when
-	disabled. Bit 1 is 1 if asynchronous page faults can be injected
-	when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults
-	are delivered to L1 as #PF vmexits.  Bit 2 can be set only if
-	KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID.
-
-	First 4 byte of 64 byte memory location will be written to by
-	the hypervisor at the time of asynchronous page fault (APF)
-	injection to indicate type of asynchronous page fault. Value
-	of 1 means that the page referred to by the page fault is not
-	present. Value 2 means that the page is now available. Disabling
-	interrupt inhibits APFs. Guest must not enable interrupt
-	before the reason is read, or it may be overwritten by another
-	APF. Since APF uses the same exception vector as regular page
-	fault guest must reset the reason to 0 before it does
-	something that can generate normal page fault.  If during page
-	fault APF reason is 0 it means that this is regular page
-	fault.
-
-	During delivery of type 1 APF cr2 contains a token that will
-	be used to notify a guest when missing page becomes
-	available. When page becomes available type 2 APF is sent with
-	cr2 set to the token associated with the page. There is special
-	kind of token 0xffffffff which tells vcpu that it should wake
-	up all processes waiting for APFs and no individual type 2 APFs
-	will be sent.
+	Asynchronous page fault (APF) control MSR.
+
+	Bits 63-6 hold 64-byte aligned physical address of a 64 byte memory area
+	which must be in guest RAM and must be zeroed. This memory is expected
+	to hold a copy of the following structure::
+
+	  struct kvm_vcpu_pv_apf_data {
+		/* Used for 'page not present' events delivered via #PF */
+		__u32 flags;
+
+		/* Used for 'page ready' events delivered via interrupt notification */
+		__u32 token;
+
+		__u8 pad[56];
+		__u32 enabled;
+	  };
+
+	Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1
+	when asynchronous page faults are enabled on the vcpu, 0 when disabled.
+	Bit 1 is 1 if asynchronous page faults can be injected when vcpu is in
+	cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as
+	#PF vmexits.  Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is
+	present in CPUID. Bit 3 enables interrupt based delivery of 'page ready'
+	events.
+
+	'Page not present' events are currently always delivered as synthetic
+	#PF exception. During delivery of these events APF CR2 register contains
+	a token that will be used to notify the guest when missing page becomes
+	available. Also, to make it possible to distinguish between real #PF and
+	APF, first 4 bytes of 64 byte memory location ('flags') will be written
+	to by the hypervisor at the time of injection. Only first bit of 'flags'
+	is currently supported, when set, it indicates that the guest is dealing
+	with asynchronous 'page not present' event. If during a page fault APF
+	'flags' is '0' it means that this is regular page fault. Guest is
+	supposed to clear 'flags' when it is done handling #PF exception so the
+	next event can be delivered.
+
+	Note, since APF 'page not present' events use the same exception vector
+	as regular page fault, guest must reset 'flags' to '0' before it does
+	something that can generate normal page fault.
+
+	Bytes 5-7 of 64 byte memory location ('token') will be written to by the
+	hypervisor at the time of APF 'page ready' event injection. The content
+	of these bytes is a token which was previously delivered as 'page not
+	present' event. The event indicates the page in now available. Guest is
+	supposed to write '0' to 'token' when it is done handling 'page ready'
+	event so the next one can be delivered.
+
+	Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page
+	ready' APF delivery needs to be written to before enabling APF mechanism
+	in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected.
+
+	Note, previously, 'page ready' events were delivered via the same #PF
+	exception as 'page not present' events but this is now deprecated. If
+	bit 3 (interrupt based delivery) is not set APF events are not delivered.
 
 	If APF is disabled while there are outstanding APFs, they will
 	not be delivered.
 
-	Currently type 2 APF will be always delivered on the same vcpu as
-	type 1 was, but guest should not rely on that.
+	Currently 'page ready' APF events will be always delivered on the
+	same vcpu as 'page not present' event was, but guest should not rely on
+	that.
 
 MSR_KVM_STEAL_TIME:
 	0x4b564d03
@@ -319,3 +346,16 @@ data:
 
 	KVM guests can request the host not to poll on HLT, for example if
 	they are performing polling themselves.
+
+MSR_KVM_ASYNC_PF_INT:
+	0x4b564d06
+
+data:
+	Second asynchronous page fault (APF) control MSR.
+
+	Bits 0-7: APIC vector for delivery of 'page ready' APF events.
+	Bits 8-63: Reserved
+
+	Interrupt vector for asynchnonous 'page ready' notifications delivery.
+	The vector has to be set up before asynchronous page fault mechanism
+	is enabled in MSR_KVM_ASYNC_PF_EN.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f3897e417b69..2d39571451a0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -767,7 +767,9 @@ struct kvm_vcpu_arch {
 		bool halted;
 		gfn_t gfns[ASYNC_PF_PER_VCPU];
 		struct gfn_to_hva_cache data;
-		u64 msr_val;
+		u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */
+		u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
+		u16 vec;
 		u32 id;
 		bool send_user_only;
 		u32 host_apf_flags;
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index d1cd5c0f431a..1d37d616b1fc 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -50,6 +50,7 @@
 #define MSR_KVM_STEAL_TIME  0x4b564d03
 #define MSR_KVM_PV_EOI_EN      0x4b564d04
 #define MSR_KVM_POLL_CONTROL	0x4b564d05
+#define MSR_KVM_ASYNC_PF_INT	0x4b564d06
 
 struct kvm_steal_time {
 	__u64 steal;
@@ -81,6 +82,11 @@ struct kvm_clock_pairing {
 #define KVM_ASYNC_PF_ENABLED			(1 << 0)
 #define KVM_ASYNC_PF_SEND_ALWAYS		(1 << 1)
 #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT	(1 << 2)
+#define KVM_ASYNC_PF_DELIVERY_AS_INT		(1 << 3)
+
+/* MSR_KVM_ASYNC_PF_INT */
+#define KVM_ASYNC_PF_VEC_MASK			GENMASK(7, 0)
+
 
 /* Operations for KVM_HC_MMU_OP */
 #define KVM_MMU_OP_WRITE_PTE            1
@@ -112,8 +118,12 @@ struct kvm_mmu_op_release_pt {
 #define KVM_PV_REASON_PAGE_READY 2
 
 struct kvm_vcpu_pv_apf_data {
+	/* Used for 'page not present' events delivered via #PF */
 	__u32 flags;
-	__u32 token; /* Used for page ready notification only */
+
+	/* Used for 'page ready' events delivered via interrupt notification */
+	__u32 token;
+
 	__u8 pad[56];
 	__u32 enabled;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e79b37b2b7e..e6f3ec5193b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1248,7 +1248,7 @@ static const u32 emulated_msrs_all[] = {
 	HV_X64_MSR_TSC_EMULATION_STATUS,
 
 	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
-	MSR_KVM_PV_EOI_EN,
+	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT,
 
 	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
@@ -2673,17 +2673,24 @@ out:
 	return r;
 }
 
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+	return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
 	gpa_t gpa = data & ~0x3f;
 
-	/* Bits 3:5 are reserved, Should be zero */
-	if (data & 0x38)
+	/* Bits 4:5 are reserved, Should be zero */
+	if (data & 0x30)
 		return 1;
 
-	vcpu->arch.apf.msr_val = data;
+	vcpu->arch.apf.msr_en_val = data;
 
-	if (!(data & KVM_ASYNC_PF_ENABLED)) {
+	if (!kvm_pv_async_pf_enabled(vcpu)) {
 		kvm_clear_async_pf_completion_queue(vcpu);
 		kvm_async_pf_hash_reset(vcpu);
 		return 0;
@@ -2695,7 +2702,25 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 
 	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
 	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
 	kvm_async_pf_wakeup_all(vcpu);
+
+	return 0;
+}
+
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
+{
+	/* Bits 8-63 are reserved */
+	if (data >> 8)
+		return 1;
+
+	if (!lapic_in_kernel(vcpu))
+		return 1;
+
+	vcpu->arch.apf.msr_int_val = data;
+
+	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
 	return 0;
 }
 
@@ -2917,6 +2942,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_ASYNC_PF_INT:
+		if (kvm_pv_enable_async_pf_int(vcpu, data))
+			return 1;
+		break;
 	case MSR_KVM_STEAL_TIME:
 
 		if (unlikely(!sched_info_on()))
@@ -3191,7 +3220,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vcpu->arch.time;
 		break;
 	case MSR_KVM_ASYNC_PF_EN:
-		msr_info->data = vcpu->arch.apf.msr_val;
+		msr_info->data = vcpu->arch.apf.msr_en_val;
+		break;
+	case MSR_KVM_ASYNC_PF_INT:
+		msr_info->data = vcpu->arch.apf.msr_int_val;
 		break;
 	case MSR_KVM_STEAL_TIME:
 		msr_info->data = vcpu->arch.st.msr_val;
@@ -9553,7 +9585,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.cr2 = 0;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	vcpu->arch.apf.msr_val = 0;
+	vcpu->arch.apf.msr_en_val = 0;
+	vcpu->arch.apf.msr_int_val = 0;
 	vcpu->arch.st.msr_val = 0;
 
 	kvmclock_reset(vcpu);
@@ -10430,10 +10463,22 @@ static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
 
 static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
 {
-	u64 val = (u64)token << 32 | KVM_PV_REASON_PAGE_READY;
+	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
 
-	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
-				      sizeof(val));
+	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+					     &token, offset, sizeof(token));
+}
+
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
+{
+	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+	u32 val;
+
+	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+					 &val, offset, sizeof(val)))
+		return false;
+
+	return !val;
 }
 
 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
@@ -10441,9 +10486,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
 		return false;
 
-	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
-	    (vcpu->arch.apf.send_user_only &&
-	     kvm_x86_ops.get_cpl(vcpu) == 0))
+	if (!kvm_pv_async_pf_enabled(vcpu) ||
+	    (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
 		return false;
 
 	return true;
@@ -10499,7 +10543,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work)
 {
-	struct x86_exception fault;
+	struct kvm_lapic_irq irq = {
+		.delivery_mode = APIC_DM_FIXED,
+		.vector = vcpu->arch.apf.vec
+	};
 
 	if (work->wakeup_all)
 		work->arch.token = ~0; /* broadcast wakeup */
@@ -10507,26 +10554,20 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
 
-	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
-	    !apf_put_user_ready(vcpu, work->arch.token)) {
-			fault.vector = PF_VECTOR;
-			fault.error_code_valid = true;
-			fault.error_code = 0;
-			fault.nested_page_fault = false;
-			fault.address = work->arch.token;
-			fault.async_page_fault = true;
-			kvm_inject_page_fault(vcpu, &fault);
-	}
+	if (kvm_pv_async_pf_enabled(vcpu) &&
+	    !apf_put_user_ready(vcpu, work->arch.token))
+		kvm_apic_set_irq(vcpu, &irq, NULL);
+
 	vcpu->arch.apf.halted = false;
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 {
-	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+	if (!kvm_pv_async_pf_enabled(vcpu))
 		return true;
 	else
-		return kvm_can_do_async_pf(vcpu);
+		return apf_pageready_slot_free(vcpu);
 }
 
 void kvm_arch_start_assignment(struct kvm *kvm)
-- 
cgit v1.2.3


From 557a961abbe06ed9dfd3b55ef7bd6e68295cda3d Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:21 +0200
Subject: KVM: x86: acknowledgment mechanism for async pf page ready
 notifications

If two page ready notifications happen back to back the second one is not
delivered and the only mechanism we currently have is
kvm_check_async_pf_completion() check in vcpu_run() loop. The check will
only be performed with the next vmexit when it happens and in some cases
it may take a while. With interrupt based page ready notification delivery
the situation is even worse: unlike exceptions, interrupts are not handled
immediately so we must check if the slot is empty. This is slow and
unnecessary. Introduce dedicated MSR_KVM_ASYNC_PF_ACK MSR to communicate
the fact that the slot is free and host should check its notification
queue. Mandate using it for interrupt based 'page ready' APF event
delivery.

As kvm_check_async_pf_completion() is going away from vcpu_run() we need
a way to communicate the fact that vcpu->async_pf.done queue has
transitioned from empty to non-empty state. Introduce
kvm_arch_async_page_present_queued() and KVM_REQ_APF_READY to do the job.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-7-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/msr.rst       | 15 ++++++++++++++-
 arch/s390/include/asm/kvm_host.h     |  2 ++
 arch/x86/include/asm/kvm_host.h      |  3 +++
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kvm/x86.c                   | 26 ++++++++++++++++++++++----
 virt/kvm/async_pf.c                  | 10 ++++++++++
 6 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst
index be08df12f31a..9b107889b033 100644
--- a/Documentation/virt/kvm/msr.rst
+++ b/Documentation/virt/kvm/msr.rst
@@ -236,7 +236,9 @@ data:
 	of these bytes is a token which was previously delivered as 'page not
 	present' event. The event indicates the page in now available. Guest is
 	supposed to write '0' to 'token' when it is done handling 'page ready'
-	event so the next one can be delivered.
+	event and to write 1' to MSR_KVM_ASYNC_PF_ACK after clearing the location;
+	writing to the MSR forces KVM to re-scan its queue and deliver the next
+	pending notification.
 
 	Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page
 	ready' APF delivery needs to be written to before enabling APF mechanism
@@ -359,3 +361,14 @@ data:
 	Interrupt vector for asynchnonous 'page ready' notifications delivery.
 	The vector has to be set up before asynchronous page fault mechanism
 	is enabled in MSR_KVM_ASYNC_PF_EN.
+
+MSR_KVM_ASYNC_PF_ACK:
+	0x4b564d07
+
+data:
+	Asynchronous page fault (APF) acknowledgment.
+
+	When the guest is done processing 'page ready' APF event and 'token'
+	field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to
+	write '1' to bit 0 of the MSR, this causes the host to re-scan its queue
+	and check if there are more notifications pending.
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9d23bde1363b..3d554887794e 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -984,6 +984,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
 
+static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {}
+
 void kvm_arch_crypto_clear_masks(struct kvm *kvm);
 void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
 			       unsigned long *aqm, unsigned long *adm);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2d39571451a0..f345bcf38cfb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -86,6 +86,7 @@
 #define KVM_REQ_TLB_FLUSH_CURRENT	KVM_ARCH_REQ(26)
 #define KVM_REQ_HV_TLB_FLUSH \
 	KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_APF_READY		KVM_ARCH_REQ(28)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -775,6 +776,7 @@ struct kvm_vcpu_arch {
 		u32 host_apf_flags;
 		unsigned long nested_apf_token;
 		bool delivery_as_pf_vmexit;
+		bool pageready_pending;
 	} apf;
 
 	/* OSVW MSRs (AMD only) */
@@ -1662,6 +1664,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 				 struct kvm_async_pf *work);
 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
 			       struct kvm_async_pf *work);
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
 extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 1d37d616b1fc..7ac20df80ba8 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -51,6 +51,7 @@
 #define MSR_KVM_PV_EOI_EN      0x4b564d04
 #define MSR_KVM_POLL_CONTROL	0x4b564d05
 #define MSR_KVM_ASYNC_PF_INT	0x4b564d06
+#define MSR_KVM_ASYNC_PF_ACK	0x4b564d07
 
 struct kvm_steal_time {
 	__u64 steal;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e6f3ec5193b2..c9d709a672f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1248,7 +1248,7 @@ static const u32 emulated_msrs_all[] = {
 	HV_X64_MSR_TSC_EMULATION_STATUS,
 
 	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
-	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT,
+	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
 
 	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
@@ -2946,6 +2946,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (kvm_pv_enable_async_pf_int(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_ASYNC_PF_ACK:
+		if (data & 0x1) {
+			vcpu->arch.apf.pageready_pending = false;
+			kvm_check_async_pf_completion(vcpu);
+		}
+		break;
 	case MSR_KVM_STEAL_TIME:
 
 		if (unlikely(!sched_info_on()))
@@ -3225,6 +3231,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_KVM_ASYNC_PF_INT:
 		msr_info->data = vcpu->arch.apf.msr_int_val;
 		break;
+	case MSR_KVM_ASYNC_PF_ACK:
+		msr_info->data = 0;
+		break;
 	case MSR_KVM_STEAL_TIME:
 		msr_info->data = vcpu->arch.st.msr_val;
 		break;
@@ -8413,6 +8422,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_hv_process_stimers(vcpu);
 		if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
 			kvm_vcpu_update_apicv(vcpu);
+		if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
+			kvm_check_async_pf_completion(vcpu);
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8664,8 +8675,6 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			break;
 		}
 
-		kvm_check_async_pf_completion(vcpu);
-
 		if (signal_pending(current)) {
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -10555,13 +10564,22 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
 
 	if (kvm_pv_async_pf_enabled(vcpu) &&
-	    !apf_put_user_ready(vcpu, work->arch.token))
+	    !apf_put_user_ready(vcpu, work->arch.token)) {
+		vcpu->arch.apf.pageready_pending = true;
 		kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
 
 	vcpu->arch.apf.halted = false;
 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
+{
+	kvm_make_request(KVM_REQ_APF_READY, vcpu);
+	if (!vcpu->arch.apf.pageready_pending)
+		kvm_vcpu_kick(vcpu);
+}
+
 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 {
 	if (!kvm_pv_async_pf_enabled(vcpu))
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 82e53f180a1a..f1e07fae84e9 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -51,6 +51,7 @@ static void async_pf_execute(struct work_struct *work)
 	unsigned long addr = apf->addr;
 	gpa_t cr2_or_gpa = apf->cr2_or_gpa;
 	int locked = 1;
+	bool first;
 
 	might_sleep();
 
@@ -69,10 +70,14 @@ static void async_pf_execute(struct work_struct *work)
 		kvm_arch_async_page_present(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
+	first = list_empty(&vcpu->async_pf.done);
 	list_add_tail(&apf->link, &vcpu->async_pf.done);
 	apf->vcpu = NULL;
 	spin_unlock(&vcpu->async_pf.lock);
 
+	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+		kvm_arch_async_page_present_queued(vcpu);
+
 	/*
 	 * apf may be freed by kvm_check_async_pf_completion() after
 	 * this point
@@ -201,6 +206,7 @@ retry_sync:
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 {
 	struct kvm_async_pf *work;
+	bool first;
 
 	if (!list_empty_careful(&vcpu->async_pf.done))
 		return 0;
@@ -213,9 +219,13 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
 	spin_lock(&vcpu->async_pf.lock);
+	first = list_empty(&vcpu->async_pf.done);
 	list_add_tail(&work->link, &vcpu->async_pf.done);
 	spin_unlock(&vcpu->async_pf.lock);
 
+	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+		kvm_arch_async_page_present_queued(vcpu);
+
 	vcpu->async_pf.queued++;
 	return 0;
 }
-- 
cgit v1.2.3


From 72de5fa4c16195827252b961ba44028a39dfeaff Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Mon, 25 May 2020 16:41:22 +0200
Subject: KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT

Introduce new capability to indicate that KVM supports interrupt based
delivery of 'page ready' APF events. This includes support for both
MSR_KVM_ASYNC_PF_INT and MSR_KVM_ASYNC_PF_ACK.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200525144125.143875-8-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/cpuid.rst     |  6 ++++++
 Documentation/virt/kvm/msr.rst       | 12 ++++++++----
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kvm/cpuid.c                 |  3 ++-
 arch/x86/kvm/x86.c                   |  1 +
 include/uapi/linux/kvm.h             |  1 +
 6 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst
index f721c89327ec..a7dff9186bed 100644
--- a/Documentation/virt/kvm/cpuid.rst
+++ b/Documentation/virt/kvm/cpuid.rst
@@ -86,6 +86,12 @@ KVM_FEATURE_PV_SCHED_YIELD        13          guest checks this feature bit
                                               before using paravirtualized
                                               sched yield.
 
+KVM_FEATURE_ASYNC_PF_INT          14          guest checks this feature bit
+                                              before using the second async
+                                              pf control msr 0x4b564d06 and
+                                              async pf acknowledgment msr
+                                              0x4b564d07.
+
 KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24          host will warn if no guest-side
                                               per-cpu warps are expeced in
                                               kvmclock
diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst
index 9b107889b033..e37a14c323d2 100644
--- a/Documentation/virt/kvm/msr.rst
+++ b/Documentation/virt/kvm/msr.rst
@@ -213,7 +213,8 @@ data:
 	cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as
 	#PF vmexits.  Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is
 	present in CPUID. Bit 3 enables interrupt based delivery of 'page ready'
-	events.
+	events. Bit 3 can only be set if KVM_FEATURE_ASYNC_PF_INT is present in
+	CPUID.
 
 	'Page not present' events are currently always delivered as synthetic
 	#PF exception. During delivery of these events APF CR2 register contains
@@ -242,7 +243,8 @@ data:
 
 	Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page
 	ready' APF delivery needs to be written to before enabling APF mechanism
-	in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected.
+	in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. The MSR is
+	available if KVM_FEATURE_ASYNC_PF_INT is present in CPUID.
 
 	Note, previously, 'page ready' events were delivered via the same #PF
 	exception as 'page not present' events but this is now deprecated. If
@@ -360,7 +362,8 @@ data:
 
 	Interrupt vector for asynchnonous 'page ready' notifications delivery.
 	The vector has to be set up before asynchronous page fault mechanism
-	is enabled in MSR_KVM_ASYNC_PF_EN.
+	is enabled in MSR_KVM_ASYNC_PF_EN.  The MSR is only available if
+	KVM_FEATURE_ASYNC_PF_INT is present in CPUID.
 
 MSR_KVM_ASYNC_PF_ACK:
 	0x4b564d07
@@ -371,4 +374,5 @@ data:
 	When the guest is done processing 'page ready' APF event and 'token'
 	field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to
 	write '1' to bit 0 of the MSR, this causes the host to re-scan its queue
-	and check if there are more notifications pending.
+	and check if there are more notifications pending. The MSR is available
+	if KVM_FEATURE_ASYNC_PF_INT is present in CPUID.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 7ac20df80ba8..812e9b4c1114 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -31,6 +31,7 @@
 #define KVM_FEATURE_PV_SEND_IPI	11
 #define KVM_FEATURE_POLL_CONTROL	12
 #define KVM_FEATURE_PV_SCHED_YIELD	13
+#define KVM_FEATURE_ASYNC_PF_INT	14
 
 #define KVM_HINTS_REALTIME      0
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index cd708b0b460a..a9f1905896fb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -711,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
 			     (1 << KVM_FEATURE_PV_SEND_IPI) |
 			     (1 << KVM_FEATURE_POLL_CONTROL) |
-			     (1 << KVM_FEATURE_PV_SCHED_YIELD);
+			     (1 << KVM_FEATURE_PV_SCHED_YIELD) |
+			     (1 << KVM_FEATURE_ASYNC_PF_INT);
 
 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9d709a672f3..ae3a7f2fbda2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3446,6 +3446,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_ASYNC_PF_INT:
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_READONLY_MEM:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ac9eba0289d1..fc0075988b80 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_PROTECTED 180
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
+#define KVM_CAP_ASYNC_PF_INT 183
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From cbd717585b8038d93c309176bb563a5c6de60ac7 Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Fri, 29 May 2020 15:43:44 +0800
Subject: KVM: x86/pmu: Tweak kvm_pmu_get_msr to pass 'struct msr_data' in

Change kvm_pmu_get_msr() to get the msr_data struct, as the host_initiated
field from the struct could be used by get_msr. This also makes this API
consistent with kvm_pmu_set_msr. No functional changes.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Message-Id: <20200529074347.124619-2-like.xu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/pmu.c           |  4 ++--
 arch/x86/kvm/pmu.h           |  4 ++--
 arch/x86/kvm/svm/pmu.c       |  7 ++++---
 arch/x86/kvm/vmx/pmu_intel.c | 19 +++++++++++--------
 arch/x86/kvm/x86.c           |  4 ++--
 5 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a5078841bdac..b86346903f2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -397,9 +397,9 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
 		__set_bit(pmc->idx, pmu->pmc_in_use);
 }
 
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr, data);
+	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
 }
 
 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a6c78a797cb1..ab85eed8a6cc 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -32,7 +32,7 @@ struct kvm_pmu_ops {
 	struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
 	int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
 	bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
-	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 	void (*refresh)(struct kvm_vcpu *vcpu);
 	void (*init)(struct kvm_vcpu *vcpu);
@@ -147,7 +147,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
 int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
 bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
 void kvm_pmu_reset(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index ce0b10fe5e2b..035da07500e8 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -215,21 +215,22 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
 	return pmc;
 }
 
-static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
 
 	/* MSR_PERFCTRn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
 	if (pmc) {
-		*data = pmc_read_counter(pmc);
+		msr_info->data = pmc_read_counter(pmc);
 		return 0;
 	}
 	/* MSR_EVNTSELn */
 	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
 	if (pmc) {
-		*data = pmc->eventsel;
+		msr_info->data = pmc->eventsel;
 		return 0;
 	}
 
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7c857737b438..e1a303fefc16 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -184,35 +184,38 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
 	return pmc;
 }
 
-static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
 
 	switch (msr) {
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
-		*data = pmu->fixed_ctr_ctrl;
+		msr_info->data = pmu->fixed_ctr_ctrl;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_STATUS:
-		*data = pmu->global_status;
+		msr_info->data = pmu->global_status;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_CTRL:
-		*data = pmu->global_ctrl;
+		msr_info->data = pmu->global_ctrl;
 		return 0;
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-		*data = pmu->global_ovf_ctrl;
+		msr_info->data = pmu->global_ovf_ctrl;
 		return 0;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
 			u64 val = pmc_read_counter(pmc);
-			*data = val & pmu->counter_bitmask[KVM_PMC_GP];
+			msr_info->data =
+				val & pmu->counter_bitmask[KVM_PMC_GP];
 			return 0;
 		} else if ((pmc = get_fixed_pmc(pmu, msr))) {
 			u64 val = pmc_read_counter(pmc);
-			*data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
+			msr_info->data =
+				val & pmu->counter_bitmask[KVM_PMC_FIXED];
 			return 0;
 		} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
-			*data = pmc->eventsel;
+			msr_info->data = pmc->eventsel;
 			return 0;
 		}
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae3a7f2fbda2..7e34ff784695 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3148,7 +3148,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
-			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+			return kvm_pmu_get_msr(vcpu, msr_info);
 		msr_info->data = 0;
 		break;
 	case MSR_IA32_UCODE_REV:
@@ -3316,7 +3316,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
-			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+			return kvm_pmu_get_msr(vcpu, msr_info);
 		if (!ignore_msrs) {
 			vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
 					       msr_info->index);
-- 
cgit v1.2.3


From 27461da31089ace6966e4f1695cba7eb87ffbe4f Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Fri, 29 May 2020 15:43:45 +0800
Subject: KVM: x86/pmu: Support full width counting

Intel CPUs have a new alternative MSR range (starting from MSR_IA32_PMC0)
for GP counters that allows writing the full counter width. Enable this
range from a new capability bit (IA32_PERF_CAPABILITIES.FW_WRITE[bit 13]).

The guest would query CPUID to get the counter width, and sign extends
the counter values as needed. The traditional MSRs always limit to 32bit,
even though the counter internally is larger (48 or 57 bits).

When the new capability is set, use the alternative range which do not
have these restrictions. This lowers the overhead of perf stat slightly
because it has to do less interrupts to accumulate the counter value.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Message-Id: <20200529074347.124619-3-like.xu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c            |  2 +-
 arch/x86/kvm/vmx/capabilities.h | 11 +++++++++
 arch/x86/kvm/vmx/pmu_intel.c    | 52 +++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/vmx/vmx.c          |  3 +++
 arch/x86/kvm/x86.c              |  2 ++
 6 files changed, 66 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f345bcf38cfb..b878fcd164ce 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -601,6 +601,7 @@ struct kvm_vcpu_arch {
 	u64 ia32_xss;
 	u64 microcode_version;
 	u64 arch_capabilities;
+	u64 perf_capabilities;
 
 	/*
 	 * Paging state of the vcpu
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a9f1905896fb..253b8e875ccd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -296,7 +296,7 @@ void kvm_set_cpu_caps(void)
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
 		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 8903475f751e..4bbd8b448d22 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -18,6 +18,8 @@ extern int __read_mostly pt_mode;
 #define PT_MODE_SYSTEM		0
 #define PT_MODE_HOST_GUEST	1
 
+#define PMU_CAP_FW_WRITES	(1ULL << 13)
+
 struct nested_vmx_msrs {
 	/*
 	 * We only store the "true" versions of the VMX capability MSRs. We
@@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void)
 	return pt_mode == PT_MODE_HOST_GUEST;
 }
 
+static inline u64 vmx_get_perf_capabilities(void)
+{
+	/*
+	 * Since counters are virtualized, KVM would support full
+	 * width counting unconditionally, even if the host lacks it.
+	 */
+	return PMU_CAP_FW_WRITES;
+}
+
 #endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e1a303fefc16..d33d890b605f 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -18,6 +18,8 @@
 #include "nested.h"
 #include "pmu.h"
 
+#define MSR_PMC_FULL_WIDTH_BIT      (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+
 static struct kvm_event_hw_type_mapping intel_arch_events[] = {
 	/* Index must match CPUID 0x0A.EBX bit vector */
 	[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -150,6 +152,22 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
 	return &counters[array_index_nospec(idx, num_counters)];
 }
 
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+		return false;
+
+	return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+}
+
+static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+	if (!fw_writes_is_enabled(pmu_to_vcpu(pmu)))
+		return NULL;
+
+	return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
+}
+
 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -162,10 +180,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		ret = pmu->version > 1;
 		break;
+	case MSR_IA32_PERF_CAPABILITIES:
+		ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM);
+		break;
 	default:
 		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
 			get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
-			get_fixed_pmc(pmu, msr);
+			get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
 		break;
 	}
 
@@ -203,8 +224,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		msr_info->data = pmu->global_ovf_ctrl;
 		return 0;
+	case MSR_IA32_PERF_CAPABILITIES:
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+			return 1;
+		msr_info->data = vcpu->arch.perf_capabilities;
+		return 0;
 	default:
-		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
 			u64 val = pmc_read_counter(pmc);
 			msr_info->data =
 				val & pmu->counter_bitmask[KVM_PMC_GP];
@@ -261,9 +289,22 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 0;
 		}
 		break;
+	case MSR_IA32_PERF_CAPABILITIES:
+		if (!msr_info->host_initiated)
+			return 1;
+		if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ?
+			(data & ~vmx_get_perf_capabilities()) : data)
+			return 1;
+		vcpu->arch.perf_capabilities = data;
+		return 0;
 	default:
-		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
-			if (!msr_info->host_initiated)
+		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+			if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+			    (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+				return 1;
+			if (!msr_info->host_initiated &&
+			    !(msr & MSR_PMC_FULL_WIDTH_BIT))
 				data = (s64)(s32)data;
 			pmc->counter += data - pmc_read_counter(pmc);
 			if (pmc->perf_event)
@@ -303,6 +344,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
 	pmu->version = 0;
 	pmu->reserved_bits = 0xffffffff00200000ull;
+	vcpu->arch.perf_capabilities = 0;
 
 	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
 	if (!entry)
@@ -315,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 		return;
 
 	perf_get_x86_pmu_capability(&x86_pmu);
+	if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+		vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
 
 	pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
 					 x86_pmu.num_counters_gp);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5a43af0061ef..170cc76a581f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1788,6 +1788,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 		if (!nested)
 			return 1;
 		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+	case MSR_IA32_PERF_CAPABILITIES:
+		msr->data = vmx_get_perf_capabilities();
+		return 0;
 	default:
 		return 1;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e34ff784695..ca8a57312291 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1253,6 +1253,7 @@ static const u32 emulated_msrs_all[] = {
 	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_ARCH_CAPABILITIES,
+	MSR_IA32_PERF_CAPABILITIES,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
@@ -1319,6 +1320,7 @@ static const u32 msr_based_features_all[] = {
 	MSR_F10H_DECFG,
 	MSR_IA32_UCODE_REV,
 	MSR_IA32_ARCH_CAPABILITIES,
+	MSR_IA32_PERF_CAPABILITIES,
 };
 
 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
-- 
cgit v1.2.3


From 850448f35aaf45215276468d63c91ab1e230cf06 Mon Sep 17 00:00:00 2001
From: Peter Shier <pshier@google.com>
Date: Tue, 26 May 2020 14:51:06 -0700
Subject: KVM: nVMX: Fix VMX preemption timer migration

Add new field to hold preemption timer expiration deadline
appended to struct kvm_vmx_nested_state_hdr. This is to prevent
the first VM-Enter after migration from incorrectly restarting the timer
with the full timer value instead of partially decayed timer value.
KVM_SET_NESTED_STATE restarts timer using migrated state regardless
of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.

Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state")

Signed-off-by: Peter Shier <pshier@google.com>
Signed-off-by: Makarand Sonare <makarandsonare@google.com>
Message-Id: <20200526215107.205814-2-makarandsonare@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  |  4 +++
 arch/x86/include/uapi/asm/kvm.h |  3 +++
 arch/x86/kvm/vmx/nested.c       | 55 +++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/vmx/vmx.h          |  2 ++
 4 files changed, 56 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d0569db2b1e2..db22e834ce2a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4334,9 +4334,13 @@ Errors:
   #define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE	0x00000001
   #define KVM_STATE_NESTED_VMX_SMM_VMXON	0x00000002
 
+#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+
   struct kvm_vmx_nested_state_hdr {
+	__u32 flags;
 	__u64 vmxon_pa;
 	__u64 vmcs12_pa;
+	__u64 preemption_timer_deadline;
 
 	struct {
 		__u16 flags;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 12075a9de1c1..17c5a038f42d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -400,6 +400,7 @@ struct kvm_sync_regs {
 
 #define KVM_STATE_NESTED_SVM_VMCB_SIZE	0x1000
 
+#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE	0x00000001
 
 struct kvm_vmx_nested_state_data {
 	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -407,8 +408,10 @@ struct kvm_vmx_nested_state_data {
 };
 
 struct kvm_vmx_nested_state_hdr {
+	__u32 flags;
 	__u64 vmxon_pa;
 	__u64 vmcs12_pa;
+	__u64 preemption_timer_deadline;
 
 	struct {
 		__u16 flags;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 119a2f7395d6..da87bb8670bb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2087,9 +2087,29 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	u64 timer_value = 0;
+
+	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
+			    VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+
+	if (!vmx->nested.has_preemption_timer_deadline) {
+		timer_value = vmcs12->vmx_preemption_timer_value;
+		vmx->nested.preemption_timer_deadline = timer_value +
+							l1_scaled_tsc;
+		vmx->nested.has_preemption_timer_deadline = true;
+	} else if (l1_scaled_tsc < vmx->nested.preemption_timer_deadline)
+		timer_value = vmx->nested.preemption_timer_deadline -
+			      l1_scaled_tsc;
+	return timer_value;
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
+					u64 preemption_timeout)
 {
-	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	/*
@@ -3348,8 +3368,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	 * the timer.
 	 */
 	vmx->nested.preemption_timer_expired = false;
-	if (nested_cpu_has_preemption_timer(vmcs12))
-		vmx_start_preemption_timer(vcpu);
+	if (nested_cpu_has_preemption_timer(vmcs12)) {
+		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
+		vmx_start_preemption_timer(vcpu, timer_value);
+	}
 
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -3457,6 +3479,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * the nested entry.
 	 */
 	vmx->nested.nested_run_pending = 1;
+	vmx->nested.has_preemption_timer_deadline = false;
 	status = nested_vmx_enter_non_root_mode(vcpu, true);
 	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
 		goto vmentry_failed;
@@ -3957,9 +3980,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
 
 	if (nested_cpu_has_preemption_timer(vmcs12) &&
-	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-			vmcs12->vmx_preemption_timer_value =
-				vmx_get_preemption_timer_value(vcpu);
+	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
+	    !vmx->nested.nested_run_pending)
+		vmcs12->vmx_preemption_timer_value =
+			vmx_get_preemption_timer_value(vcpu);
 
 	/*
 	 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -5891,8 +5915,10 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 		.flags = 0,
 		.format = KVM_STATE_NESTED_FORMAT_VMX,
 		.size = sizeof(kvm_state),
+		.hdr.vmx.flags = 0,
 		.hdr.vmx.vmxon_pa = -1ull,
 		.hdr.vmx.vmcs12_pa = -1ull,
+		.hdr.vmx.preemption_timer_deadline = 0,
 	};
 	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
 		&user_kvm_nested_state->data.vmx[0];
@@ -5934,6 +5960,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 
 			if (vmx->nested.mtf_pending)
 				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
+
+			if (nested_cpu_has_preemption_timer(vmcs12) &&
+			    vmx->nested.has_preemption_timer_deadline) {
+				kvm_state.hdr.vmx.flags |=
+					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
+				kvm_state.hdr.vmx.preemption_timer_deadline =
+					vmx->nested.preemption_timer_deadline;
+			}
 		}
 	}
 
@@ -5979,7 +6013,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
 			return -EFAULT;
 	}
-
 out:
 	return kvm_state.size;
 }
@@ -6141,6 +6174,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 			goto error_guest_mode;
 	}
 
+	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
+		vmx->nested.has_preemption_timer_deadline = true;
+		vmx->nested.preemption_timer_deadline =
+			kvm_state->hdr.vmx.preemption_timer_deadline;
+	}
+
 	if (nested_vmx_check_controls(vcpu, vmcs12) ||
 	    nested_vmx_check_host_state(vcpu, vmcs12) ||
 	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 298ddef79d00..672c28f17e49 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -169,6 +169,8 @@ struct nested_vmx {
 	u16 posted_intr_nv;
 
 	struct hrtimer preemption_timer;
+	u64 preemption_timer_deadline;
+	bool has_preemption_timer_deadline;
 	bool preemption_timer_expired;
 
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
-- 
cgit v1.2.3


From 8d7fbf01f9afc9ea1381de705013aa3ca453009f Mon Sep 17 00:00:00 2001
From: Makarand Sonare <makarandsonare@google.com>
Date: Tue, 26 May 2020 14:51:07 -0700
Subject: KVM: selftests: VMX preemption timer migration test

When a nested VM with a VMX-preemption timer is migrated, verify that the
nested VM and its parent VM observe the VMX-preemption timer exit close to
the original expiration deadline.

Signed-off-by: Makarand Sonare <makarandsonare@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Message-Id: <20200526215107.205814-3-makarandsonare@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c                          |  12 +-
 tools/arch/x86/include/uapi/asm/kvm.h              |   1 +
 tools/testing/selftests/kvm/.gitignore             |   1 +
 tools/testing/selftests/kvm/Makefile               |   1 +
 tools/testing/selftests/kvm/include/kvm_util.h     |   2 +
 .../selftests/kvm/include/x86_64/processor.h       |  11 +-
 tools/testing/selftests/kvm/include/x86_64/vmx.h   |  27 +++
 .../kvm/x86_64/vmx_preemption_timer_test.c         | 255 +++++++++++++++++++++
 8 files changed, 298 insertions(+), 12 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index da87bb8670bb..9c74a732b08d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2091,20 +2091,16 @@ static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u64 timer_value = 0;
 
 	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
 			    VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
 
 	if (!vmx->nested.has_preemption_timer_deadline) {
-		timer_value = vmcs12->vmx_preemption_timer_value;
-		vmx->nested.preemption_timer_deadline = timer_value +
-							l1_scaled_tsc;
+		vmx->nested.preemption_timer_deadline =
+			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
 		vmx->nested.has_preemption_timer_deadline = true;
-	} else if (l1_scaled_tsc < vmx->nested.preemption_timer_deadline)
-		timer_value = vmx->nested.preemption_timer_deadline -
-			      l1_scaled_tsc;
-	return timer_value;
+	}
+	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
 }
 
 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 3f3f780c8c65..43e24903812c 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -400,6 +400,7 @@ struct kvm_sync_regs {
 struct kvm_vmx_nested_state_data {
 	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
 	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+	__u64 preemption_timer_deadline;
 };
 
 struct kvm_vmx_nested_state_hdr {
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 222e50104296..f159718f90c0 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -10,6 +10,7 @@
 /x86_64/set_sregs_test
 /x86_64/smm_test
 /x86_64/state_test
+/x86_64/vmx_preemption_timer_test
 /x86_64/svm_vmcall_test
 /x86_64/sync_regs_test
 /x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 29200b606bcc..b4ff112e5c7e 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -46,6 +46,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
 TEST_GEN_PROGS_x86_64 += x86_64/smm_test
 TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index e244c6ecfc1d..919e161dd289 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -314,6 +314,8 @@ void ucall_uninit(struct kvm_vm *vm);
 void ucall(uint64_t cmd, int nargs, ...);
 uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
 
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4)	\
+				ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
 #define GUEST_SYNC(stage)	ucall(UCALL_SYNC, 2, "hello", stage)
 #define GUEST_DONE()		ucall(UCALL_DONE, 0)
 #define __GUEST_ASSERT(_condition, _nargs, _args...) do {	\
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 7428513a4c68..7cb19eca6c72 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 *desc)
 static inline uint64_t rdtsc(void)
 {
 	uint32_t eax, edx;
-
+	uint32_t tsc_val;
 	/*
 	 * The lfence is to wait (on Intel CPUs) until all previous
-	 * instructions have been executed.
+	 * instructions have been executed. If software requires RDTSC to be
+	 * executed prior to execution of any subsequent instruction, it can
+	 * execute LFENCE immediately after RDTSC
 	 */
-	__asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
-	return ((uint64_t)edx) << 32 | eax;
+	__asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
+	tsc_val = ((uint64_t)edx) << 32 | eax;
+	return tsc_val;
 }
 
 static inline uint64_t rdtscp(uint32_t *aux)
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 3d27069b9ed9..ccff3e6e2704 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -575,6 +575,33 @@ struct vmx_pages {
 	void *eptp;
 };
 
+union vmx_basic {
+	u64 val;
+	struct {
+		u32 revision;
+		u32	size:13,
+			reserved1:3,
+			width:1,
+			dual:1,
+			type:4,
+			insouts:1,
+			ctrl:1,
+			vm_entry_exception_ctrl:1,
+			reserved2:7;
+	};
+};
+
+union vmx_ctrl_msr {
+	u64 val;
+	struct {
+		u32 set, clr;
+	};
+};
+
+union vmx_basic basic;
+union vmx_ctrl_msr ctrl_pin_rev;
+union vmx_ctrl_msr ctrl_exit_rev;
+
 struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
 bool prepare_for_vmx_operation(struct vmx_pages *vmx);
 void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
new file mode 100644
index 000000000000..cc72b6188ca7
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX-preemption timer test
+ *
+ * Copyright (C) 2020, Google, LLC.
+ *
+ * Test to ensure the VM-Enter after migration doesn't
+ * incorrectly restarts the timer with the full timer
+ * value instead of partially decayed timer value
+ *
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID		5
+#define PREEMPTION_TIMER_VALUE			100000000ull
+#define PREEMPTION_TIMER_VALUE_THRESHOLD1	 80000000ull
+
+u32 vmx_pt_rate;
+bool l2_save_restore_done;
+static u64 l2_vmx_pt_start;
+volatile u64 l2_vmx_pt_finish;
+
+void l2_guest_code(void)
+{
+	u64 vmx_pt_delta;
+
+	vmcall();
+	l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+	/*
+	 * Wait until the 1st threshold has passed
+	 */
+	do {
+		l2_vmx_pt_finish = rdtsc();
+		vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
+				vmx_pt_rate;
+	} while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
+
+	/*
+	 * Force L2 through Save and Restore cycle
+	 */
+	GUEST_SYNC(1);
+
+	l2_save_restore_done = 1;
+
+	/*
+	 * Now wait for the preemption timer to fire and
+	 * exit to L1
+	 */
+	while ((l2_vmx_pt_finish = rdtsc()))
+		;
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	u64 l1_vmx_pt_start;
+	u64 l1_vmx_pt_finish;
+	u64 l1_tsc_deadline, l2_tsc_deadline;
+
+	GUEST_ASSERT(vmx_pages->vmcs_gpa);
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+	GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	/*
+	 * Check for Preemption timer support
+	 */
+	basic.val = rdmsr(MSR_IA32_VMX_BASIC);
+	ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
+			: MSR_IA32_VMX_PINBASED_CTLS);
+	ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
+			: MSR_IA32_VMX_EXIT_CTLS);
+
+	if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+	    !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+		return;
+
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+	vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN));
+
+	/*
+	 * Turn on PIN control and resume the guest
+	 */
+	GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+			      vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+			      PIN_BASED_VMX_PREEMPTION_TIMER));
+
+	GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
+			      PREEMPTION_TIMER_VALUE));
+
+	vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
+
+	l2_save_restore_done = 0;
+
+	l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+	GUEST_ASSERT(!vmresume());
+
+	l1_vmx_pt_finish = rdtsc();
+
+	/*
+	 * Ensure exit from L2 happens after L2 goes through
+	 * save and restore
+	 */
+	GUEST_ASSERT(l2_save_restore_done);
+
+	/*
+	 * Ensure the exit from L2 is due to preemption timer expiry
+	 */
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
+
+	l1_tsc_deadline = l1_vmx_pt_start +
+		(PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+	l2_tsc_deadline = l2_vmx_pt_start +
+		(PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+	/*
+	 * Sync with the host and pass the l1|l2 pt_expiry_finish times and
+	 * tsc deadlines so that host can verify they are as expected
+	 */
+	GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
+		l2_vmx_pt_finish, l2_tsc_deadline);
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+	if (vmx_pages)
+		l1_guest_code(vmx_pages);
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t vmx_pages_gva = 0;
+
+	struct kvm_regs regs1, regs2;
+	struct kvm_vm *vm;
+	struct kvm_run *run;
+	struct kvm_x86_state *state;
+	struct ucall uc;
+	int stage;
+
+	/*
+	 * AMD currently does not implement any VMX features, so for now we
+	 * just early out.
+	 */
+	nested_vmx_check_supported();
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+	run = vcpu_state(vm, VCPU_ID);
+
+	vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+	if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+		vcpu_alloc_vmx(vm, &vmx_pages_gva);
+		vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+	} else {
+		pr_info("will skip vmx preemption timer checks\n");
+		goto done;
+	}
+
+	for (stage = 1;; stage++) {
+		_vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Stage %d: unexpected exit reason: %u (%s),\n",
+			    stage, run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_ABORT:
+			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+				  __FILE__, uc.args[1]);
+			/* NOT REACHED */
+		case UCALL_SYNC:
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+
+		/* UCALL_SYNC is handled here.  */
+		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+			    uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+			    stage, (ulong)uc.args[1]);
+		/*
+		 * If this stage 2 then we should verify the vmx pt expiry
+		 * is as expected.
+		 * From L1's perspective verify Preemption timer hasn't
+		 * expired too early.
+		 * From L2's perspective verify Preemption timer hasn't
+		 * expired too late.
+		 */
+		if (stage == 2) {
+
+			pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
+				stage, uc.args[2], uc.args[3]);
+
+			pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
+				stage, uc.args[4], uc.args[5]);
+
+			TEST_ASSERT(uc.args[2] >= uc.args[3],
+				"Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
+				stage, uc.args[2], uc.args[3]);
+
+			TEST_ASSERT(uc.args[4] < uc.args[5],
+				"Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
+				stage, uc.args[4], uc.args[5]);
+		}
+
+		state = vcpu_save_state(vm, VCPU_ID);
+		memset(&regs1, 0, sizeof(regs1));
+		vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+		kvm_vm_release(vm);
+
+		/* Restore state in a new VM.  */
+		kvm_vm_restart(vm, O_RDWR);
+		vm_vcpu_add(vm, VCPU_ID);
+		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+		vcpu_load_state(vm, VCPU_ID, state);
+		run = vcpu_state(vm, VCPU_ID);
+		free(state);
+
+		memset(&regs2, 0, sizeof(regs2));
+		vcpu_regs_get(vm, VCPU_ID, &regs2);
+		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+			    (ulong) regs2.rdi, (ulong) regs2.rsi);
+	}
+
+done:
+	kvm_vm_free(vm);
+}
-- 
cgit v1.2.3


From 22ad0026d0978d4211eb07a2be77f49fb40d86eb Mon Sep 17 00:00:00 2001
From: Jon Doron <arilou@gmail.com>
Date: Fri, 29 May 2020 16:45:39 +0300
Subject: x86/hyper-v: Add synthetic debugger definitions

Hyper-V synthetic debugger has two modes, one that uses MSRs and
the other that use Hypercalls.

Add all the required definitions to both types of synthetic debugger
interface.

Some of the required new CPUIDs and MSRs are not documented in the TLFS
so they are in hyperv.h instead.

The reason they are not documented is because they are subjected to be
removed in future versions of Windows.

Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Jon Doron <arilou@gmail.com>
Message-Id: <20200529134543.1127440-3-arilou@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/hyperv-tlfs.h |  6 ++++++
 arch/x86/kvm/hyperv.h              | 27 +++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 29336574d0bc..53ef6b7bd380 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -131,6 +131,8 @@
 #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE		BIT(8)
 /* Crash MSR available */
 #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE		BIT(10)
+/* Support for debug MSRs available */
+#define HV_FEATURE_DEBUG_MSRS_AVAILABLE			BIT(11)
 /* stimer Direct Mode is available */
 #define HV_STIMER_DIRECT_MODE_AVAILABLE			BIT(19)
 
@@ -376,6 +378,9 @@ struct hv_tsc_emulation_status {
 #define HVCALL_SEND_IPI_EX			0x0015
 #define HVCALL_POST_MESSAGE			0x005c
 #define HVCALL_SIGNAL_EVENT			0x005d
+#define HVCALL_POST_DEBUG_DATA			0x0069
+#define HVCALL_RETRIEVE_DEBUG_DATA		0x006a
+#define HVCALL_RESET_DEBUG_SESSION		0x006b
 #define HVCALL_RETARGET_INTERRUPT		0x007e
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
@@ -422,6 +427,7 @@ enum HV_GENERIC_SET_FORMAT {
 #define HV_STATUS_INVALID_HYPERCALL_INPUT	3
 #define HV_STATUS_INVALID_ALIGNMENT		4
 #define HV_STATUS_INVALID_PARAMETER		5
+#define HV_STATUS_OPERATION_DENIED		8
 #define HV_STATUS_INSUFFICIENT_MEMORY		11
 #define HV_STATUS_INVALID_PORT_ID		17
 #define HV_STATUS_INVALID_CONNECTION_ID		18
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 757cb578101c..7f50ff0bad07 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -23,6 +23,33 @@
 
 #include <linux/kvm_host.h>
 
+/*
+ * The #defines related to the synthetic debugger are required by KDNet, but
+ * they are not documented in the Hyper-V TLFS because the synthetic debugger
+ * functionality has been deprecated and is subject to removal in future
+ * versions of Windows.
+ */
+#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS	0x40000080
+#define HYPERV_CPUID_SYNDBG_INTERFACE			0x40000081
+#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES	0x40000082
+
+/*
+ * Hyper-V synthetic debugger platform capabilities
+ * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+ */
+#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING	BIT(1)
+
+/* Hyper-V Synthetic debug options MSR */
+#define HV_X64_MSR_SYNDBG_CONTROL		0x400000F1
+#define HV_X64_MSR_SYNDBG_STATUS		0x400000F2
+#define HV_X64_MSR_SYNDBG_SEND_BUFFER		0x400000F3
+#define HV_X64_MSR_SYNDBG_RECV_BUFFER		0x400000F4
+#define HV_X64_MSR_SYNDBG_PENDING_BUFFER	0x400000F5
+#define HV_X64_MSR_SYNDBG_OPTIONS		0x400000FF
+
+/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
+#define HV_X64_SYNDBG_OPTION_USE_HCALLS		BIT(2)
+
 static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
 {
 	return &vcpu->arch.hyperv;
-- 
cgit v1.2.3


From f97f5a56f5977311f3833056a73cdbb0ee56cb1e Mon Sep 17 00:00:00 2001
From: Jon Doron <arilou@gmail.com>
Date: Fri, 29 May 2020 16:45:40 +0300
Subject: x86/kvm/hyper-v: Add support for synthetic debugger interface

Add support for Hyper-V synthetic debugger (syndbg) interface.
The syndbg interface is using MSRs to emulate a way to send/recv packets
data.

The debug transport dll (kdvm/kdnet) will identify if Hyper-V is enabled
and if it supports the synthetic debugger interface it will attempt to
use it, instead of trying to initialize a network adapter.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Jon Doron <arilou@gmail.com>
Message-Id: <20200529134543.1127440-4-arilou@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  |  16 ++++
 arch/x86/include/asm/kvm_host.h |  13 ++++
 arch/x86/kvm/hyperv.c           | 158 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/hyperv.h           |   5 ++
 arch/x86/kvm/trace.h            |  51 +++++++++++++
 arch/x86/kvm/x86.c              |   8 ++
 include/uapi/linux/kvm.h        |  10 +++
 7 files changed, 258 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index db22e834ce2a..aad60be4884e 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5070,6 +5070,7 @@ EOI was received.
 		struct kvm_hyperv_exit {
   #define KVM_EXIT_HYPERV_SYNIC          1
   #define KVM_EXIT_HYPERV_HCALL          2
+  #define KVM_EXIT_HYPERV_SYNDBG         3
 			__u32 type;
 			__u32 pad1;
 			union {
@@ -5085,6 +5086,15 @@ EOI was received.
 					__u64 result;
 					__u64 params[2];
 				} hcall;
+				struct {
+					__u32 msr;
+					__u32 pad2;
+					__u64 control;
+					__u64 status;
+					__u64 send_page;
+					__u64 recv_page;
+					__u64 pending_page;
+				} syndbg;
 			} u;
 		};
 		/* KVM_EXIT_HYPERV */
@@ -5101,6 +5111,12 @@ Hyper-V SynIC state change. Notification is used to remap SynIC
 event/message pages and to enable/disable SynIC messages/events processing
 in userspace.
 
+	- KVM_EXIT_HYPERV_SYNDBG -- synchronously notify user-space about
+
+Hyper-V Synthetic debugger state change. Notification is used to either update
+the pending_page location or to send a control command (send the buffer located
+in send_page or recv a buffer to recv_page).
+
 ::
 
 		/* KVM_EXIT_ARM_NISV */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b878fcd164ce..58337a25396a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -863,6 +863,18 @@ struct kvm_apic_map {
 	struct kvm_lapic *phys_map[];
 };
 
+/* Hyper-V synthetic debugger (SynDbg)*/
+struct kvm_hv_syndbg {
+	struct {
+		u64 control;
+		u64 status;
+		u64 send_page;
+		u64 recv_page;
+		u64 pending_page;
+	} control;
+	u64 options;
+};
+
 /* Hyper-V emulation context */
 struct kvm_hv {
 	struct mutex hv_lock;
@@ -886,6 +898,7 @@ struct kvm_hv {
 	atomic_t num_mismatched_vp_indexes;
 
 	struct hv_partition_assist_pg *hv_pa_pg;
+	struct kvm_hv_syndbg hv_syndbg;
 };
 
 enum kvm_irqchip_mode {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f9d3b919823c..c21f99357ad5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -21,6 +21,7 @@
 #include "x86.h"
 #include "lapic.h"
 #include "ioapic.h"
+#include "cpuid.h"
 #include "hyperv.h"
 
 #include <linux/cpu.h>
@@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
 	return ret;
 }
 
+static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *entry;
+
+	entry = kvm_find_cpuid_entry(vcpu,
+				     HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES,
+				     0);
+	if (!entry)
+		return false;
+
+	return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+}
+
+static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+
+	if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
+		hv->hv_syndbg.control.status =
+			vcpu->run->hyperv.u.syndbg.status;
+	return 1;
+}
+
+static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+	struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+	hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
+	hv_vcpu->exit.u.syndbg.msr = msr;
+	hv_vcpu->exit.u.syndbg.control = syndbg->control.control;
+	hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page;
+	hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page;
+	hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page;
+	vcpu->arch.complete_userspace_io =
+			kvm_hv_syndbg_complete_userspace;
+
+	kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+	struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+
+	if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+		return 1;
+
+	trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
+				    vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data);
+	switch (msr) {
+	case HV_X64_MSR_SYNDBG_CONTROL:
+		syndbg->control.control = data;
+		if (!host)
+			syndbg_exit(vcpu, msr);
+		break;
+	case HV_X64_MSR_SYNDBG_STATUS:
+		syndbg->control.status = data;
+		break;
+	case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+		syndbg->control.send_page = data;
+		break;
+	case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+		syndbg->control.recv_page = data;
+		break;
+	case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+		syndbg->control.pending_page = data;
+		if (!host)
+			syndbg_exit(vcpu, msr);
+		break;
+	case HV_X64_MSR_SYNDBG_OPTIONS:
+		syndbg->options = data;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+	struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+
+	if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+		return 1;
+
+	switch (msr) {
+	case HV_X64_MSR_SYNDBG_CONTROL:
+		*pdata = syndbg->control.control;
+		break;
+	case HV_X64_MSR_SYNDBG_STATUS:
+		*pdata = syndbg->control.status;
+		break;
+	case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+		*pdata = syndbg->control.send_page;
+		break;
+	case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+		*pdata = syndbg->control.recv_page;
+		break;
+	case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+		*pdata = syndbg->control.pending_page;
+		break;
+	case HV_X64_MSR_SYNDBG_OPTIONS:
+		*pdata = syndbg->options;
+		break;
+	default:
+		break;
+	}
+
+	trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id,
+				    vcpu_to_hv_vcpu(vcpu)->vp_index, msr,
+				    *pdata);
+
+	return 0;
+}
+
 static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
 			 bool host)
 {
@@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
 	case HV_X64_MSR_TSC_EMULATION_CONTROL:
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
+	case HV_X64_MSR_SYNDBG_OPTIONS:
+	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
 		r = true;
 		break;
 	}
@@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		if (!host)
 			return 1;
 		break;
+	case HV_X64_MSR_SYNDBG_OPTIONS:
+	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+		return syndbg_set_msr(vcpu, msr, data, host);
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 	return 0;
 }
 
-static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+			     bool host)
 {
 	u64 data = 0;
 	struct kvm *kvm = vcpu->kvm;
@@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
 		data = hv->hv_tsc_emulation_status;
 		break;
+	case HV_X64_MSR_SYNDBG_OPTIONS:
+	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+		return syndbg_get_msr(vcpu, msr, pdata, host);
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 		int r;
 
 		mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
-		r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
+		r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
 		mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
 		return r;
 	} else
@@ -1795,6 +1922,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 		{ .function = HYPERV_CPUID_FEATURES },
 		{ .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
 		{ .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
+		{ .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS },
+		{ .function = HYPERV_CPUID_SYNDBG_INTERFACE },
+		{ .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES	},
 		{ .function = HYPERV_CPUID_NESTED_FEATURES },
 	};
 	int i, nent = ARRAY_SIZE(cpuid_entries);
@@ -1820,7 +1950,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 		case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
 			memcpy(signature, "Linux KVM Hv", 12);
 
-			ent->eax = HYPERV_CPUID_NESTED_FEATURES;
+			ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
 			ent->ebx = signature[0];
 			ent->ecx = signature[1];
 			ent->edx = signature[2];
@@ -1859,6 +1989,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 			ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
 			ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
 
+			ent->ebx |= HV_X64_DEBUGGING;
+			ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
+			ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+
 			/*
 			 * Direct Synthetic timers only make sense with in-kernel
 			 * LAPIC
@@ -1902,6 +2036,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
 
 			break;
 
+		case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS:
+			memcpy(signature, "Linux KVM Hv", 12);
+
+			ent->eax = 0;
+			ent->ebx = signature[0];
+			ent->ecx = signature[1];
+			ent->edx = signature[2];
+			break;
+
+		case HYPERV_CPUID_SYNDBG_INTERFACE:
+			memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
+			ent->eax = signature[0];
+			break;
+
+		case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES:
+			ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+			break;
+
 		default:
 			break;
 		}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 7f50ff0bad07..e68c6c2e9649 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -73,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
 	return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
 }
 
+static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->kvm->arch.hyperv.hv_syndbg;
+}
+
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 54a10c98d746..b66432b015d2 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1541,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
 		__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
 );
 
+/*
+ * Tracepoint for syndbg_set_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_set_msr,
+	TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+	TP_ARGS(vcpu_id, vp_index, msr, data),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, vp_index)
+		__field(u32, msr)
+		__field(u64, data)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->vp_index = vp_index;
+		__entry->msr = msr;
+		__entry->data = data;
+	),
+
+	TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+		  __entry->vcpu_id, __entry->vp_index, __entry->msr,
+		  __entry->data)
+);
+
+/*
+ * Tracepoint for syndbg_get_msr.
+ */
+TRACE_EVENT(kvm_hv_syndbg_get_msr,
+	TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data),
+	TP_ARGS(vcpu_id, vp_index, msr, data),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, vp_index)
+		__field(u32, msr)
+		__field(u64, data)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->vp_index = vp_index;
+		__entry->msr = msr;
+		__entry->data = data;
+	),
+
+	TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx",
+		  __entry->vcpu_id, __entry->vp_index, __entry->msr,
+		  __entry->data)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca8a57312291..9e41b5135340 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1246,6 +1246,10 @@ static const u32 emulated_msrs_all[] = {
 	HV_X64_MSR_VP_ASSIST_PAGE,
 	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 	HV_X64_MSR_TSC_EMULATION_STATUS,
+	HV_X64_MSR_SYNDBG_OPTIONS,
+	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
 
 	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
@@ -3011,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		 */
 		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+	case HV_X64_MSR_SYNDBG_OPTIONS:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
@@ -3272,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = 0x20000000;
 		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+	case HV_X64_MSR_SYNDBG_OPTIONS:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cc04aceb8793..6721eb563eda 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -188,6 +188,7 @@ struct kvm_s390_cmma_log {
 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
 #define KVM_EXIT_HYPERV_HCALL          2
+#define KVM_EXIT_HYPERV_SYNDBG         3
 	__u32 type;
 	__u32 pad1;
 	union {
@@ -203,6 +204,15 @@ struct kvm_hyperv_exit {
 			__u64 result;
 			__u64 params[2];
 		} hcall;
+		struct {
+			__u32 msr;
+			__u32 pad2;
+			__u64 control;
+			__u64 status;
+			__u64 send_page;
+			__u64 recv_page;
+			__u64 pending_page;
+		} syndbg;
 	} u;
 };
 
-- 
cgit v1.2.3


From 45c38973ed1868b8448079edd48bf24ab8b326fa Mon Sep 17 00:00:00 2001
From: Jon Doron <arilou@gmail.com>
Date: Fri, 29 May 2020 16:45:41 +0300
Subject: x86/kvm/hyper-v: enable hypercalls regardless of hypercall page

Microsoft's kdvm.dll dbgtransport module does not respect the hypercall
page and simply identifies the CPU being used (AMD/Intel) and according
to it simply makes hypercalls with the relevant instruction
(vmmcall/vmcall respectively).

The relevant function in kdvm is KdHvConnectHypervisor which first checks
if the hypercall page has been enabled via HV_X64_MSR_HYPERCALL_ENABLE,
and in case it was not it simply sets the HV_X64_MSR_GUEST_OS_ID to
0x1000101010001 which means:
build_number = 0x0001
service_version = 0x01
minor_version = 0x01
major_version = 0x01
os_id = 0x00 (Undefined)
vendor_id = 1 (Microsoft)
os_type = 0 (A value of 0 indicates a proprietary, closed source OS)

and starts issuing the hypercall without setting the hypercall page.

To resolve this issue simply enable hypercalls also if the guest_os_id
is not 0.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Jon Doron <arilou@gmail.com>
Message-Id: <20200529134543.1127440-5-arilou@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c21f99357ad5..2fb1464a483f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1656,7 +1656,7 @@ ret_success:
 
 bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 {
-	return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
+	return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0;
 }
 
 static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
-- 
cgit v1.2.3


From b187038b5e3f8404dbafce89fd169e66fe604df4 Mon Sep 17 00:00:00 2001
From: Jon Doron <arilou@gmail.com>
Date: Fri, 29 May 2020 16:45:42 +0300
Subject: x86/kvm/hyper-v: Add support for synthetic debugger via hypercalls

There is another mode for the synthetic debugger which uses hypercalls
to send/recv network data instead of the MSR interface.

This interface is much slower and less recommended since you might get
a lot of VMExits while KDVM polling for new packets to recv, rather
than simply checking the pending page to see if there is data avialble
and then request.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Jon Doron <arilou@gmail.com>
Message-Id: <20200529134543.1127440-6-arilou@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2fb1464a483f..4e1695db788a 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1835,6 +1835,34 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 		}
 		ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
 		break;
+	case HVCALL_POST_DEBUG_DATA:
+	case HVCALL_RETRIEVE_DEBUG_DATA:
+		if (unlikely(fast)) {
+			ret = HV_STATUS_INVALID_PARAMETER;
+			break;
+		}
+		fallthrough;
+	case HVCALL_RESET_DEBUG_SESSION: {
+		struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+
+		if (!kvm_hv_is_syndbg_enabled(vcpu)) {
+			ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+			break;
+		}
+
+		if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) {
+			ret = HV_STATUS_OPERATION_DENIED;
+			break;
+		}
+		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+		vcpu->run->hyperv.u.hcall.input = param;
+		vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+		vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+		vcpu->arch.complete_userspace_io =
+				kvm_hv_hypercall_complete_userspace;
+		return 0;
+	}
 	default:
 		ret = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
-- 
cgit v1.2.3