From 52ac8b358b0cb7e91c966225fca61be5d1c984bc Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 27 May 2021 08:09:15 -0400
Subject: KVM: Block memslot updates across range_start() and range_end()

We would like to avoid taking mmu_lock for .invalidate_range_{start,end}()
notifications that are unrelated to KVM.  Because mmu_notifier_count
must be modified while holding mmu_lock for write, and must always
be paired across start->end to stay balanced, lock elision must happen
in both or none.  Therefore, in preparation for this change, this patch
prevents memslot updates across range_start() and range_end().

Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.

A long note on the locking: a previous version of the patch used an rwsem
to block the memslot update while the MMU notifier runs, but this resulted
in the following deadlock involving the pseudo-lock tagged as
"mmu_notifier_invalidate_range_start".

   ======================================================
   WARNING: possible circular locking dependency detected
   5.12.0-rc3+ #6 Tainted: G OE
   ------------------------------------------------------
   qemu-system-x86/3069 is trying to acquire lock:
   ffffffff9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190

   but task is already holding lock:
   ffffaff7410a9160 (&kvm->mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm]

   which lock already depends on the new lock.

This corresponds to the following MMU notifier logic:

  invalidate_range_start
    take pseudo lock
    down_read()           (*)
    release pseudo lock
  invalidate_range_end
    take pseudo lock      (**)
    up_read()
    release pseudo lock

At point (*) we take the mmu_notifier_slots_lock inside the pseudo lock;
at point (**) we take the pseudo lock inside the mmu_notifier_slots_lock.

This could cause a deadlock (ignoring for a second that the pseudo lock
is not a lock):

- invalidate_range_start waits on down_read(), because the rwsem is
  held by install_new_memslots

- install_new_memslots waits on down_write(), because the rwsem is
  held till (another) invalidate_range_end finishes

- invalidate_range_end waits on the pseudo lock, held by
  invalidate_range_start.

Removing the fairness of the rwsem breaks the cycle (in lockdep terms,
it would change the *shared* rwsem readers into *shared recursive*
readers), so open-code the wait using a readers count and a spinlock.
This also allows handling blockable and non-blockable critical sections
in the same way.

Losing the rwsem fairness does theoretically allow MMU notifiers to
block install_new_memslots forever.  Note that mm/mmu_notifier.c's own
retry scheme in mmu_interval_read_begin also uses wait/wake_up and is
likewise not fair.

Signed-off-by: Paolo Bonzini
---
 Documentation/virt/kvm/locking.rst | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'Documentation/virt')

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 35eca377543d..8138201efb09 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows:
   can be taken inside a kvm->srcu read-side critical section,
   while kvm->slots_lock cannot.
 
+- kvm->mn_active_invalidate_count ensures that pairs of
+  invalidate_range_start() and invalidate_range_end() callbacks
+  use the same memslots array.  kvm->slots_lock and kvm->slots_arch_lock
+  are taken on the waiting side in install_new_memslots, so MMU notifiers
+  must not take either kvm->slots_lock or kvm->slots_arch_lock.
+
 On x86:
 
 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
--
cgit v1.2.3


From 0176ec51290f8ef543a8c18a02e932d6ccedbbc5 Mon Sep 17 00:00:00 2001
From: Jing Zhang
Date: Mon, 2 Aug 2021 16:56:30 +0000
Subject: KVM: stats: Update doc for histogram statistics

Add documentation for linear and logarithmic histogram statistics.

Signed-off-by: Jing Zhang
Message-Id: <20210802165633.1866976-3-jingzhangos@google.com>
[Small changes to the phrasing. - Paolo]
Signed-off-by: Paolo Bonzini
---
 Documentation/virt/kvm/api.rst | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'Documentation/virt')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index dae68e68ca23..86d7ad3a126c 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5207,6 +5207,9 @@ by a string of size ``name_size``.
 	#define KVM_STATS_TYPE_CUMULATIVE	(0x0 << KVM_STATS_TYPE_SHIFT)
 	#define KVM_STATS_TYPE_INSTANT		(0x1 << KVM_STATS_TYPE_SHIFT)
 	#define KVM_STATS_TYPE_PEAK		(0x2 << KVM_STATS_TYPE_SHIFT)
+	#define KVM_STATS_TYPE_LINEAR_HIST	(0x3 << KVM_STATS_TYPE_SHIFT)
+	#define KVM_STATS_TYPE_LOG_HIST		(0x4 << KVM_STATS_TYPE_SHIFT)
+	#define KVM_STATS_TYPE_MAX		KVM_STATS_TYPE_LOG_HIST
 
 	#define KVM_STATS_UNIT_SHIFT		4
 	#define KVM_STATS_UNIT_MASK		(0xF << KVM_STATS_UNIT_SHIFT)
@@ -5214,18 +5217,20 @@ by a string of size ``name_size``.
 	#define KVM_STATS_UNIT_BYTES		(0x1 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_SECONDS		(0x2 << KVM_STATS_UNIT_SHIFT)
 	#define KVM_STATS_UNIT_CYCLES		(0x3 << KVM_STATS_UNIT_SHIFT)
+	#define KVM_STATS_UNIT_MAX		KVM_STATS_UNIT_CYCLES
 
 	#define KVM_STATS_BASE_SHIFT		8
 	#define KVM_STATS_BASE_MASK		(0xF << KVM_STATS_BASE_SHIFT)
 	#define KVM_STATS_BASE_POW10		(0x0 << KVM_STATS_BASE_SHIFT)
 	#define KVM_STATS_BASE_POW2		(0x1 << KVM_STATS_BASE_SHIFT)
+	#define KVM_STATS_BASE_MAX		KVM_STATS_BASE_POW2
 
 	struct kvm_stats_desc {
 		__u32 flags;
 		__s16 exponent;
 		__u16 size;
 		__u32 offset;
-		__u32 unused;
+		__u32 bucket_size;
 		char name[];
 	};
 
@@ -5235,21 +5240,35 @@ The following flags are supported:
 
 Bits 0-3 of ``flags`` encode the type:
 
   * ``KVM_STATS_TYPE_CUMULATIVE``
-    The statistics data is cumulative. The value of data can only be increased.
+    The statistics reports a cumulative count. The value of data can only be increased.
     Most of the counters used in KVM are of this type.
     The corresponding ``size`` field for this type is always 1.
     All cumulative statistics data are read/write.
   * ``KVM_STATS_TYPE_INSTANT``
-    The statistics data is instantaneous. Its value can be increased or
+    The statistics reports an instantaneous value. Its value can be increased or
     decreased. This type is usually used as a measurement of some resources,
     like the number of dirty pages, the number of large pages, etc.
     All instant statistics are read only.
     The corresponding ``size`` field for this type is always 1.
   * ``KVM_STATS_TYPE_PEAK``
-    The statistics data is peak. The value of data can only be increased, and
-    represents a peak value for a measurement, for example the maximum number
+    The statistics data reports a peak value, for example the maximum number
     of items in a hash table bucket, the longest time waited and so on.
+    The value of data can only be increased.
     The corresponding ``size`` field for this type is always 1.
+  * ``KVM_STATS_TYPE_LINEAR_HIST``
+    The statistic is reported as a linear histogram. The number of
+    buckets is specified by the ``size`` field. The size of buckets is specified
+    by the ``bucket_size`` field. The range of the Nth bucket (1 <= N < ``size``)
+    is [``bucket_size``*(N-1), ``bucket_size``*N), while the range of the last
+    bucket is [``bucket_size``*(``size``-1), +INF). (+INF means positive infinity
+    value.) The bucket value indicates how many samples fell in the bucket's range.
+  * ``KVM_STATS_TYPE_LOG_HIST``
+    The statistic is reported as a logarithmic histogram. The number of
+    buckets is specified by the ``size`` field. The range of the first bucket is
+    [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF).
+    Otherwise, the Nth bucket (1 < N < ``size``) covers
+    [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell
+    in the bucket's range.
 
 Bits 4-7 of ``flags`` encode the unit:
 
   * ``KVM_STATS_UNIT_NONE``
@@ -5282,9 +5301,9 @@ unsigned 64bit data.
 The ``offset`` field is the offset from the start of Data Block to the start
 of the corresponding statistics data.
 
-The ``unused`` field is reserved for future support for other types of
-statistics data, like log/linear histogram. Its value is always 0 for the types
-defined above.
+The ``bucket_size`` field is used as a parameter for histogram statistics data.
+It is only used by linear histogram statistics data, specifying the size of a
+bucket.
 
 The ``name`` field is the name string of the statistics data. The name string
 starts at the end of ``struct kvm_stats_desc``. The maximum length including
--
cgit v1.2.3


From 61e5f69ef08379cdc74e8f15d3770976ed48480a Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Wed, 11 Aug 2021 15:29:26 +0300
Subject: KVM: x86: implement KVM_GUESTDBG_BLOCKIRQ

KVM_GUESTDBG_BLOCKIRQ will allow KVM to block all interrupts while
running.

This change is mostly intended for more robust single stepping of the
guest and it has the following benefits when enabled:

* Resuming from a breakpoint is much more reliable.  When resuming
  execution from a breakpoint, with interrupts enabled, more often
  than not, KVM would inject an interrupt and make the CPU jump
  immediately to the interrupt handler and eventually return to the
  breakpoint, to trigger it again.

  From the user point of view it looks like the CPU never executed a
  single instruction and in some cases that can even prevent forward
  progress, for example, when the breakpoint is placed by an automated
  script (e.g. lx-symbols), which does something in response to the
  breakpoint and then continues the guest automatically.  If the script
  execution takes enough time for another interrupt to arrive, the
  guest will be stuck on the same breakpoint RIP forever.

* Normal single stepping is much more predictable, since it won't land
  the debugger in an interrupt handler.

* RFLAGS.TF has less chance of being leaked to the guest: we set that
  flag behind the guest's back to do single stepping, but if single
  stepping lands us in an interrupt/exception handler it will be leaked
  to the guest in the form of being pushed to the stack.  This doesn't
  completely eliminate the problem, as exceptions can still happen, but
  at least it reduces the chances of this happening.
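As an illustration only (not part of this patch), a minimal userspace
sketch of enabling the new flag could look like the snippet below; the
vcpu_fd variable and the helper name are assumptions, standing in for an
already-created KVM vCPU file descriptor:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Sketch: single-step the guest while asking KVM not to inject
 * interrupts/NMIs/SMIs, so the step cannot land in an interrupt handler.
 */
static int enable_blockirq_singlestep(int vcpu_fd)
{
	struct kvm_guest_debug dbg;

	memset(&dbg, 0, sizeof(dbg));
	dbg.control = KVM_GUESTDBG_ENABLE |
		      KVM_GUESTDBG_SINGLESTEP |
		      KVM_GUESTDBG_BLOCKIRQ;

	/* Returns 0 on success, -1 with errno set on failure. */
	return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}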
Signed-off-by: Maxim Levitsky
Message-Id: <20210811122927.900604-6-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini
---
 Documentation/virt/kvm/api.rst  | 1 +
 arch/x86/include/asm/kvm_host.h | 3 ++-
 arch/x86/include/uapi/asm/kvm.h | 1 +
 arch/x86/kvm/x86.c              | 4 ++++
 4 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'Documentation/virt')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 86d7ad3a126c..4ea1bb28297b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -3357,6 +3357,7 @@ flags which can include the following:
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
+  - KVM_GUESTDBG_BLOCKIRQ:      avoid injecting interrupts/NMI/SMI [x86]
 
 For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints
 are enabled in memory so we need to ensure breakpoint exceptions are
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4c4983a9378c..7723865077fd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,7 +222,8 @@ enum x86_intercept_stage;
 				KVM_GUESTDBG_USE_HW_BP | \
 				KVM_GUESTDBG_USE_SW_BP | \
 				KVM_GUESTDBG_INJECT_BP | \
-				KVM_GUESTDBG_INJECT_DB)
+				KVM_GUESTDBG_INJECT_DB | \
+				KVM_GUESTDBG_BLOCKIRQ)
 
 #define PFERR_PRESENT_BIT 0
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..2ef1f6513c68 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -295,6 +295,7 @@ struct kvm_debug_exit_arch {
 #define KVM_GUESTDBG_USE_HW_BP		0x00020000
 #define KVM_GUESTDBG_INJECT_DB		0x00040000
 #define KVM_GUESTDBG_INJECT_BP		0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ		0x00100000
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4e07cae56636..1a00af1b076b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8892,6 +8892,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
 		can_inject = false;
 	}
 
+	/* Don't inject interrupts if the user asked to avoid doing so */
+	if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+		return 0;
+
 	/*
 	 * Finally, inject interrupt events. If an event cannot be injected
 	 * due to architectural conditions (e.g. IF=0) a window-open exit
--
cgit v1.2.3
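As an aside on the histogram statistics documented in the stats patch
above, the bucket ranges can be summarized by how a sample value maps to
a bucket index. The helpers below are a sketch only, not taken from the
kernel sources; the function names are made up, and bucket_size/size
stand for the corresponding fields of struct kvm_stats_desc:

#include <stdint.h>

/*
 * Linear histogram: bucket i covers [bucket_size*i, bucket_size*(i+1)),
 * except the last bucket, which also counts everything beyond its lower
 * bound.  Assumes bucket_size > 0 and size > 0.
 */
static uint32_t linear_hist_bucket(uint64_t val, uint32_t bucket_size,
				   uint32_t size)
{
	uint64_t idx = val / bucket_size;

	return idx >= size ? size - 1 : (uint32_t)idx;
}

/*
 * Logarithmic histogram: bucket 0 covers [0, 1), bucket i >= 1 covers
 * [pow(2, i-1), pow(2, i)), and the last bucket is again open-ended.
 */
static uint32_t log_hist_bucket(uint64_t val, uint32_t size)
{
	uint32_t idx = 0;

	while (val) {		/* idx becomes floor(log2(val)) + 1 */
		val >>= 1;
		idx++;
	}
	return idx >= size ? size - 1 : idx;
}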