From 812756a82ea51e3c7ff7ba5e6fa3f34345234bc7 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Tue, 14 Apr 2020 17:56:25 +0200 Subject: kvm_host: unify VM_STAT and VCPU_STAT definitions in a single place The macros VM_STAT and VCPU_STAT are redundantly implemented in multiple files, each used by a different architecture to initialize the debugfs entries for statistics. Since they all have the same purpose, they can be unified in a single common definition in include/linux/kvm_host.h Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20200414155625.20559-1-eesposit@redhat.com> Acked-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 01276e3d01b9..658215f6102c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1130,6 +1130,11 @@ struct kvm_stats_debugfs_item { #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) +#define VM_STAT(n, x, ...) \ + { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } +#define VCPU_STAT(n, x, ...) \ + { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -- cgit v1.2.3 From c36b71503a2268206ebeda6697094ffb4e7e94c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 16 Apr 2020 09:48:07 -0400 Subject: KVM: x86/mmu: Avoid an extra memslot lookup in try_async_pf() for L2 Create a new function kvm_is_visible_memslot() and use it from kvm_is_visible_gfn(); use the new function in try_async_pf() too, to avoid an extra memslot lookup. Opportunistically squish a multi-line comment into a single-line comment. Note, the end result, KVM_PFN_NOSLOT, is unchanged. Cc: Jim Mattson Cc: Rick Edgecombe Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 +++------ include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 6 +----- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b67046fa0ac4..e618472c572b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4082,19 +4082,16 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable) { - struct kvm_memory_slot *slot; + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* Don't expose private memslots to L2.
*/ + if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; *writable = false; return false; } - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); async = false; *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 658215f6102c..7d4f1eb70274 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1357,6 +1357,12 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot) +{ + return (memslot && memslot->id < KVM_USER_MEM_SLOTS && + !(memslot->flags & KVM_MEMSLOT_INVALID)); +} + struct kvm_vcpu *kvm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index da8fd45e0e3e..8aa577db131e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1607,11 +1607,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); - if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || - memslot->flags & KVM_MEMSLOT_INVALID) - return false; - - return true; + return kvm_is_visible_memslot(memslot); } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); -- cgit v1.2.3 From 1b94f6f81007b4afaea3480ec018bc9236148961 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 16 Apr 2020 13:10:57 +0800 Subject: KVM: Remove redundant argument to kvm_arch_vcpu_ioctl_run In earlier versions of kvm, 'kvm_run' was an independent structure and was not included in the vcpu structure. At present, 'kvm_run' is already included in the vcpu structure, so the parameter 'kvm_run' is redundant. This patch simplifies the function definition, removes the extra 'kvm_run' parameter, and extracts it from the 'kvm_vcpu' structure if necessary. 
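As a rough, hedged illustration (not part of the patch below), an architecture's run handler after this change simply pulls the run structure out of the vcpu instead of receiving it as a second argument; the immediate_exit check mirrors the s390 hunk further down, and the rest of the body is only a placeholder for the arch-specific run loop:

int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;	/* kvm_run now comes from the vcpu itself */

	if (run->immediate_exit)
		return -EINTR;

	/* architecture-specific guest entry loop would go here */
	return 0;
}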
Signed-off-by: Tianjia Zhang Message-Id: <20200416051057.26526-1-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 ++- arch/powerpc/kvm/powerpc.c | 3 ++- arch/s390/kvm/kvm-s390.c | 3 ++- arch/x86/kvm/x86.c | 11 ++++++----- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 6 +++--- virt/kvm/kvm_main.c | 2 +- 7 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index fdf1c14d9205..9f50ceef9978 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -438,8 +438,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r = -EINTR; vcpu_load(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e15166b0a16d..7e24691e138a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1764,8 +1764,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r; vcpu_load(vcpu); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5307929a6c89..75471b646fd7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4333,8 +4333,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) store_regs_fmt2(vcpu, kvm_run); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; if (kvm_run->immediate_exit) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d9bcd5faac5..59958ce2b681 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8710,8 +8710,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@ -8729,18 +8730,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8770,7 +8771,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d4f1eb70274..5285a5568208 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -866,7 +866,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state); int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); +int 
kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu); int kvm_arch_init(void *opaque); void kvm_arch_exit(void); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 48d0ec44ad77..f5390ac2165b 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -639,7 +639,6 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer - * @run: The kvm_run structure pointer used for userspace state exchange * * This function is called through the VCPU_RUN ioctl called from user space. It * will execute VM code in a loop until the time slice for the process is used @@ -647,8 +646,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) * return with return value 0 and with the kvm_run structure filled in with the * required data for the requested emulation. */ -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int ret; if (unlikely(!kvm_vcpu_initialized(vcpu))) @@ -659,7 +659,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, vcpu->run); + ret = kvm_handle_mmio_return(vcpu, run); if (ret) return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8aa577db131e..e2f60e313c87 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3129,7 +3129,7 @@ static long kvm_vcpu_ioctl(struct file *filp, synchronize_rcu(); put_pid(oldpid); } - r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); + r = kvm_arch_vcpu_ioctl_run(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } -- cgit v1.2.3 From acd05785e48c01edb2c4f4d014d28478b5f19fb5 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 17 Apr 2020 15:14:46 -0700 Subject: kvm: add capability for halt polling KVM_CAP_HALT_POLL is a per-VM capability that lets userspace control the halt-polling time, allowing halt-polling to be tuned or disabled on particular VMs. With dynamic halt-polling, a VM's VCPUs can poll from anywhere from [0, halt_poll_ns] on each halt. KVM_CAP_HALT_POLL sets the upper limit on the poll time. Signed-off-by: David Matlack Signed-off-by: Jon Cargille Reviewed-by: Jim Mattson Message-Id: <20200417221446.108733-1-jcargill@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + virt/kvm/kvm_main.c | 19 +++++++++++++++---- 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..d871dacb984e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5802,6 +5802,23 @@ If present, this capability can be enabled for a VM, meaning that KVM will allow the transition to secure guest mode. Otherwise KVM will veto the transition. +7.20 KVM_CAP_HALT_POLL +---------------------- + +:Architectures: all +:Target: VM +:Parameters: args[0] is the maximum poll time in nanoseconds +:Returns: 0 on success; -1 on error + +This capability overrides the kvm module parameter halt_poll_ns for the +target VM. + +VCPU polling allows a VCPU to poll for wakeup events instead of immediately +scheduling during guest halts. The maximum time a VCPU can spend polling is +controlled by the kvm module parameter halt_poll_ns. 
This capability allows +the maximum halt time to specified on a per-VM basis, effectively overriding +the module parameter for the target VM. + 8. Other capabilities. ====================== diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5285a5568208..3cc6ccbb1183 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -503,6 +503,7 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; pid_t userspace_pid; + unsigned int max_halt_poll_ns; }; #define kvm_err(fmt, ...) \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..ac9eba0289d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1017,6 +1017,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_VCPU_RESETS 179 #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 +#define KVM_CAP_HALT_POLL 182 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7436d054305..33e1eee96f75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -710,6 +710,8 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_arch_destroy_vm; } + kvm->max_halt_poll_ns = halt_poll_ns; + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_arch_destroy_vm; @@ -2713,15 +2715,16 @@ out: if (!kvm_arch_no_poll(vcpu)) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); - } else if (halt_poll_ns) { + } else if (vcpu->kvm->max_halt_poll_ns) { if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (vcpu->halt_poll_ns && + block_ns > vcpu->kvm->max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ - else if (vcpu->halt_poll_ns < halt_poll_ns && - block_ns < halt_poll_ns) + else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && + block_ns < vcpu->kvm->max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; @@ -3510,6 +3513,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_HALT_POLL: return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -3560,6 +3564,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, return 0; } #endif + case KVM_CAP_HALT_POLL: { + if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) + return -EINVAL; + + kvm->max_halt_poll_ns = cap->args[0]; + return 0; + } default: return kvm_vm_ioctl_enable_cap(kvm, cap); } -- cgit v1.2.3 From 9d9a6ebfea329cadeda87544dceae01e9c27aaef Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:34 -0700 Subject: rcuwait: Let rcuwait_wake_up() return whether or not a task was awoken Propagating the return value of wake_up_process() back to the caller can come in handy for future users, such as for statistics or accounting purposes. 
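For instance (a sketch only, not taken from this patch), a wake-up path can now bump a statistic exactly when a sleeper was actually woken, which is how the KVM halt-wakeup accounting later in this series uses the new return value:

	if (rcuwait_wake_up(&vcpu->wait))
		++vcpu->stat.halt_wakeup;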
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-3-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 2 +- kernel/exit.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 2ffe1ee6d482..6ebb23258a27 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -25,7 +25,7 @@ static inline void rcuwait_init(struct rcuwait *w) w->task = NULL; } -extern void rcuwait_wake_up(struct rcuwait *w); +extern int rcuwait_wake_up(struct rcuwait *w); /* * The caller is responsible for locking around rcuwait_wait_event(), diff --git a/kernel/exit.c b/kernel/exit.c index 9f9015f3f6b0..f3beb637acf7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -227,8 +227,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -248,8 +249,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); -- cgit v1.2.3 From 5c21f7b322cb9fcb97f2e22f5976a0ae8dd71354 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:35 -0700 Subject: rcuwait: Introduce prepare_to and finish_rcuwait This allows further flexibility for some callers to implement ad-hoc versions of the generic rcuwait_wait_event(). For example, kvm will need this to maintain tracing semantics. The naming is of course similar to what waitqueue apis offer. Also go ahead and make use of rcu_assign_pointer() for both task writes as it will make the __rcu sparse people happy - this will be the special nil case, thus no added serialization. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-4-dave@stgolabs.net> Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 6ebb23258a27..45bc6604e9b1 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -29,12 +29,25 @@ extern int rcuwait_wake_up(struct rcuwait *w); /* * The caller is responsible for locking around rcuwait_wait_event(), - * such that writes to @task are properly serialized. + * and [prepare_to/finish]_rcuwait() such that writes to @task are + * properly serialized. */ + +static inline void prepare_to_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, current); +} + +static inline void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} + #define rcuwait_wait_event(w, condition, state) \ ({ \ int __ret = 0; \ - rcu_assign_pointer((w)->task, current); \ + prepare_to_rcuwait(w); \ for (;;) { \ /* \ * Implicit barrier (A) pairs with (B) in \ @@ -51,9 +64,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); \ schedule(); \ } \ - \ - WRITE_ONCE((w)->task, NULL); \ - __set_current_state(TASK_RUNNING); \ + finish_rcuwait(w); \ __ret; \ }) -- cgit v1.2.3 From 191a43be61d6791fd4a9098a35c1a09e73f55228 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:36 -0700 Subject: rcuwait: Introduce rcuwait_active() This call is lockless and thus should not be trusted blindly. 
For example, the return value of rcuwait_wake_up() should be used to track whether a process was woken up. Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-5-dave@stgolabs.net> Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 45bc6604e9b1..c1414ce44abc 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -25,6 +25,15 @@ static inline void rcuwait_init(struct rcuwait *w) w->task = NULL; } +/* + * Note: this provides no serialization and, just as with waitqueues, + * requires care to estimate as to whether or not the wait is active. + */ +static inline int rcuwait_active(struct rcuwait *w) +{ + return !!rcu_dereference(w->task); +} + extern int rcuwait_wake_up(struct rcuwait *w); /* -- cgit v1.2.3 From da4ad88cab5867ee240dfd0585e9d115a8cc47db Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Apr 2020 22:48:37 -0700 Subject: kvm: Replace vcpu->swait with rcuwait The use of any sort of waitqueue (simple or regular) for wait/waking vcpus has always been overkill and semantically wrong. Because this is per-vcpu (which is blocked) there is only ever a single waiting vcpu, thus no need for any sort of queue. As such, make use of the rcuwait primitive, with the following considerations: - rcuwait already provides the proper barriers that serialize concurrent waiter and waker. - Task wakeup is done in rcu read critical region, with a stable task pointer. - Because there is no concurrency among waiters, we need not worry about rcuwait_wait_event() calls corrupting the wait->task. As a consequence, this saves the locking done in swait when modifying the queue. This also applies to per-vcore wait for powerpc kvm-hv. The x86 tscdeadline_latency test mentioned in 8577370fb0cb ("KVM: Use simple waitqueue for vcpu->wq") shows that, on avg, latency is reduced by around 15-20% with this change. Cc: Paul Mackerras Cc: kvmarm@lists.cs.columbia.edu Cc: linux-mips@vger.kernel.org Reviewed-by: Marc Zyngier Signed-off-by: Davidlohr Bueso Message-Id: <20200424054837.5138-6-dave@stgolabs.net> [Avoid extra logic changes.
- Paolo] Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 ++---- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 23 ++++++++++------------- arch/powerpc/kvm/powerpc.c | 2 +- arch/x86/kvm/lapic.c | 2 +- include/linux/kvm_host.h | 10 +++++----- virt/kvm/arm/arch_timer.c | 3 ++- virt/kvm/arm/arm.c | 9 +++++---- virt/kvm/async_pf.c | 3 +-- virt/kvm/kvm_main.c | 19 +++++++++---------- 11 files changed, 38 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9f50ceef9978..9787cdec33e6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -283,8 +283,7 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); + rcuwait_wake_up(&vcpu->wait); return kvm_mips_count_timeout(vcpu); } @@ -511,8 +510,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (swq_has_sleeper(&dvcpu->wq)) - swake_up_one(&dvcpu->wq); + rcuwait_wake_up(&dvcpu->wait); return 0; } diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 506e4df2d730..6e5d85ba588d 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -78,7 +78,7 @@ struct kvmppc_vcore { struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; - struct swait_queue_head wq; + struct rcuwait wait; spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ u64 stolen_tb; u64 preempt_tb; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1dc63101ffe1..337047ba4a56 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -751,7 +751,7 @@ struct kvm_vcpu_arch { u8 irq_pending; /* Used by XIVE to signal pending guest irqs */ u32 last_inst; - struct swait_queue_head *wqp; + struct rcuwait *waitp; struct kvmppc_vcore *vcore; int ret; int trap; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 93493f0cbfe8..7f59c47a5b9d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -230,13 +230,11 @@ static bool kvmppc_ipi_thread(int cpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int cpu; - struct swait_queue_head *wqp; + struct rcuwait *waitp; - wqp = kvm_arch_vcpu_wq(vcpu); - if (swq_has_sleeper(wqp)) { - swake_up_one(wqp); + waitp = kvm_arch_vcpu_get_wait(vcpu); + if (rcuwait_wake_up(waitp)) ++vcpu->stat.halt_wakeup; - } cpu = READ_ONCE(vcpu->arch.thread_cpu); if (cpu >= 0 && kvmppc_ipi_thread(cpu)) @@ -2125,7 +2123,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id) spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); - init_swait_queue_head(&vcore->wq); + rcuwait_init(&vcore->wait); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = id; @@ -3784,7 +3782,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) ktime_t cur, start_poll, start_wait; int do_sleep = 1; u64 block_ns; - DECLARE_SWAITQUEUE(wait); /* Poll for pending exceptions and ceded state */ cur = start_poll = ktime_get(); @@ -3812,10 +3809,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } } - prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE); - + prepare_to_rcuwait(&vc->wait); + 
set_current_state(TASK_INTERRUPTIBLE); if (kvmppc_vcore_check_block(vc)) { - finish_swait(&vc->wq, &wait); + finish_rcuwait(&vc->wait); do_sleep = 0; /* If we polled, count this as a successful poll */ if (vc->halt_poll_ns) @@ -3829,7 +3826,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); schedule(); - finish_swait(&vc->wq, &wait); + finish_rcuwait(&vc->wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); @@ -3940,7 +3937,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - swake_up_one(&vc->wq); + rcuwait_wake_up(&vc->wait); } } @@ -4279,7 +4276,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } user_vrsave = mfspr(SPRN_VRSAVE); - vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.waitp = &vcpu->arch.vcore->wait; vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 052614e9d468..27ccff612903 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -752,7 +752,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) goto out_vcpu_uninit; - vcpu->arch.wqp = &vcpu->wq; + vcpu->arch.waitp = &vcpu->wait; kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); return 0; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 38f7dc9c16ee..42cd2e3ec6fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1861,7 +1861,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(swait_active(&vcpu->wq)); + WARN_ON(rcuwait_active(&vcpu->wait)); cancel_hv_timer(apic); apic_timer_expired(apic); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index abfa71cb5d2d..161684696610 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -277,7 +277,7 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - struct swait_queue_head wq; + struct rcuwait wait; struct pid __rcu *pid; int sigset_active; sigset_t sigset; @@ -960,12 +960,12 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) } #endif -static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP - return vcpu->arch.wqp; + return vcpu->arch.waitp; #else - return &vcpu->wq; + return &vcpu->wait; #endif } diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 93bd59b46848..d5024416e722 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -571,6 +571,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct timer_map map; + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); if (unlikely(!timer->enabled)) return; @@ -593,7 +594,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) if (map.emul_ptimer) soft_timer_cancel(&map.emul_ptimer->hrtimer); - if (swait_active(kvm_arch_vcpu_wq(vcpu))) + if (rcuwait_active(wait)) kvm_timer_blocking(vcpu); /* diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f5390ac2165b..d5db0d6141ff 100644 --- 
a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -579,16 +579,17 @@ void kvm_arm_resume_guest(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.pause = false; - swake_up_one(kvm_arch_vcpu_wq(vcpu)); + rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu)); } } static void vcpu_req_sleep(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); - swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && - (!vcpu->arch.pause))); + rcuwait_wait_event(wait, + (!vcpu->arch.power_off) &&(!vcpu->arch.pause), + TASK_INTERRUPTIBLE); if (vcpu->arch.power_off || vcpu->arch.pause) { /* Awaken to handle a signal, request we sleep again later. */ diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 15e5b037f92d..10b533f641a6 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,8 +80,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, cr2_or_gpa); - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); + rcuwait_wake_up(&vcpu->wait); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7525f3838160..e12317f32c5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -349,7 +349,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; - init_swait_queue_head(&vcpu->wq); + rcuwait_init(&vcpu->wait); kvm_async_pf_vcpu_init(vcpu); vcpu->pre_pcpu = -1; @@ -2678,7 +2678,6 @@ out: void kvm_vcpu_block(struct kvm_vcpu *vcpu) { ktime_t start, cur; - DECLARE_SWAITQUEUE(wait); bool waited = false; u64 block_ns; @@ -2704,8 +2703,9 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { - prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2713,8 +2713,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) waited = true; schedule(); } - - finish_swait(&vcpu->wq, &wait); + finish_rcuwait(&vcpu->wait); cur = ktime_get(); out: kvm_arch_vcpu_unblocking(vcpu); @@ -2746,11 +2745,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_block); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - struct swait_queue_head *wqp; + struct rcuwait *waitp; - wqp = kvm_arch_vcpu_wq(vcpu); - if (swq_has_sleeper(wqp)) { - swake_up_one(wqp); + waitp = kvm_arch_vcpu_get_wait(vcpu); + if (rcuwait_wake_up(waitp)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.halt_wakeup; return true; @@ -2892,7 +2890,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) + if (rcuwait_active(&vcpu->wait) && + !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) -- cgit v1.2.3 From 0995a5dfbe49badff78e78761fb66f46579f2f9a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Mar 2020 13:09:50 +0100 Subject: tracing: Provide lockdep less trace_hardirqs_on/off() variants trace_hardirqs_on/off() is only partially safe vs. RCU idle. The tracer core itself is safe, but the resulting tracepoints can be utilized by e.g. BPF which is unsafe. Provide variants which do not contain the lockdep invocation so the lockdep and tracer invocations can be split at the call site and placed properly. 
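As a hedged sketch of the intended call-site split (illustrative only; the exact entry-code sequence is not part of this patch, and the lockdep_hardirqs_on_prepare()/lockdep_hardirqs_on() pair is introduced by the following lockdep patch):

	/* exit to guest/idle, interrupts disabled */
	trace_hardirqs_on_prepare();			/* tracepoint while RCU is still watching */
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	/* ... transition to RCU idle / enter guest ... */
	lockdep_hardirqs_on(CALLER_ADDR0);		/* final lockdep state update */

The tracer half therefore runs before RCU stops watching, while the final lockdep update happens on the other side of the transition.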
This is required because lockdep needs to be aware of the state before switching away from RCU idle and after switching to RCU idle because these transitions can take locks. As these code pathes are going to be non-instrumentable the tracer can be invoked after RCU is turned on and before the switch to RCU idle. So for these new variants there is no need to invoke the rcuidle aware tracer functions. Name them so they match the lockdep counterparts. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134100.270771162@linutronix.de --- include/linux/irqflags.h | 4 ++++ kernel/trace/trace_preemptirq.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 61a9ced3aa50..f150e69ab81d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -29,6 +29,8 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS + extern void trace_hardirqs_on_prepare(void); + extern void trace_hardirqs_off_prepare(void); extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); # define lockdep_hardirq_context(p) ((p)->hardirq_context) @@ -96,6 +98,8 @@ do { \ } while (0) #else +# define trace_hardirqs_on_prepare() do { } while (0) +# define trace_hardirqs_off_prepare() do { } while (0) # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) # define lockdep_hardirq_context(p) 0 diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..c00880162b06 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -33,6 +51,25 @@ void trace_hardirqs_on(void) EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. 
+ */ +void trace_hardirqs_off_prepare(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); + void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { -- cgit v1.2.3 From c86e9b987cea3dd0209203e714553a47f5d7c6dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Mar 2020 14:22:03 +0100 Subject: lockdep: Prepare for noinstr sections Force inlining and prevent instrumentation of all sorts by marking the functions which are invoked from low level entry code with 'noinstr'. Split the irqflags tracking into two parts. One which does the heavy lifting while RCU is watching and the final one which can be invoked after RCU is turned off. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.484532537@linutronix.de --- include/linux/irqflags.h | 2 + include/linux/sched.h | 1 + kernel/locking/lockdep.c | 86 +++++++++++++++++++++++++++++++---------- kernel/trace/trace_preemptirq.c | 2 + lib/debug_locks.c | 2 +- 5 files changed, 71 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index f150e69ab81d..d7f7e436c3af 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -19,11 +19,13 @@ #ifdef CONFIG_PROVE_LOCKING extern void lockdep_softirqs_on(unsigned long ip); extern void lockdep_softirqs_off(unsigned long ip); + extern void lockdep_hardirqs_on_prepare(unsigned long ip); extern void lockdep_hardirqs_on(unsigned long ip); extern void lockdep_hardirqs_off(unsigned long ip); #else static inline void lockdep_softirqs_on(unsigned long ip) { } static inline void lockdep_softirqs_off(unsigned long ip) { } + static inline void lockdep_hardirqs_on_prepare(unsigned long ip) { } static inline void lockdep_hardirqs_on(unsigned long ip) { } static inline void lockdep_hardirqs_off(unsigned long ip) { } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..658de6164853 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -983,6 +983,7 @@ struct task_struct { unsigned int hardirq_disable_event; int hardirqs_enabled; int hardirq_context; + u64 hardirq_chain_key; unsigned long softirq_disable_ip; unsigned long softirq_enable_ip; unsigned int softirq_disable_event; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..9ccd675a8b5a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3635,13 +3635,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,15 +3651,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void 
lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. + */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3698,20 +3699,62 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || curr->lockdep_recursion)) + return; + + if (curr->hardirqs_enabled) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks || curr->lockdep_recursion)) return; /* @@ -3729,10 +3772,11 @@ void lockdep_hardirqs_off(unsigned long ip) curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -4408,8 +4452,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4740,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4956,7 +5000,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; 
int ret = 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index c00880162b06..fb0691b8a88d 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,6 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -93,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/lib/debug_locks.c b/lib/debug_locks.c index a75ee30b77cb..06d3135bd184 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent); /* * Generic 'turn off all lock debugging' function: */ -int debug_locks_off(void) +noinstr int debug_locks_off(void) { if (debug_locks && __debug_locks_off()) { if (!debug_locks_silent) { -- cgit v1.2.3 From af1e56b78534c38bb0e0c712ca70e59f816b74e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Mar 2020 14:53:56 +0100 Subject: context_tracking: Make guest_enter/exit() .noinstr ready Force inlining of the helpers and mark the instrumentable parts accordingly. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134341.672545766@linutronix.de --- include/linux/context_tracking.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 8150f5ac176c..8cac62ee6add 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -101,12 +101,14 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN /* must be called with irqs disabled */ -static inline void guest_enter_irqoff(void) +static __always_inline void guest_enter_irqoff(void) { + instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_enter(current); else current->flags |= PF_VCPU; + instrumentation_end(); if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_GUEST); @@ -118,39 +120,48 @@ static inline void guest_enter_irqoff(void) * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ - if (!context_tracking_enabled_this_cpu()) + if (!context_tracking_enabled_this_cpu()) { + instrumentation_begin(); rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); + } } -static inline void guest_exit_irqoff(void) +static __always_inline void guest_exit_irqoff(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); + instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_exit(current); else current->flags &= ~PF_VCPU; + instrumentation_end(); } #else -static inline void guest_enter_irqoff(void) +static __always_inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe * to assume that it's the stime pending cputime * to flush. 
*/ + instrumentation_begin(); vtime_account_kernel(current); current->flags |= PF_VCPU; rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); } -static inline void guest_exit_irqoff(void) +static __always_inline void guest_exit_irqoff(void) { + instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_kernel(current); current->flags &= ~PF_VCPU; + instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From febd668d375caf13a7fcd93b3498366854de854a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2020 06:30:09 -0400 Subject: rcuwait: avoid lockdep splats from rcuwait_active() rcuwait_active only returns whether w->task is not NULL. This is exactly one of the usecases that are mentioned in the documentation for rcu_access_pointer() where it is correct to bypass lockdep checks. This avoids a splat from kvm_vcpu_on_spin(). Reported-by: Wanpeng Li Tested-by: Wanpeng Li Acked-by: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Paolo Bonzini --- include/linux/rcuwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index c1414ce44abc..61c56cca95c4 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -31,7 +31,7 @@ static inline void rcuwait_init(struct rcuwait *w) */ static inline int rcuwait_active(struct rcuwait *w) { - return !!rcu_dereference(w->task); + return !!rcu_access_pointer(w->task); } extern int rcuwait_wake_up(struct rcuwait *w); -- cgit v1.2.3 From fc5d1f1a42fba6266ab95dc3b84937933a9b5a66 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 1 Dec 2018 08:41:28 -0800 Subject: KVM: arm64: vgic-v3: Take cpu_if pointer directly instead of vcpu If we move the used_lrs field to the version-specific cpu interface structure, the following functions only operate on the struct vgic_v3_cpu_if and not the full vcpu: __vgic_v3_save_state __vgic_v3_restore_state __vgic_v3_activate_traps __vgic_v3_deactivate_traps __vgic_v3_save_aprs __vgic_v3_restore_aprs This is going to be very useful for nested virt, so move the used_lrs field and change the prototypes and implementations of these functions to take the cpu_if parameter directly. No functional change. 
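Illustratively (this mirrors the switch.c hunk below rather than adding anything new), callers now resolve the version-specific interface once and pass that pointer straight through:

	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

	__vgic_v3_save_state(cpu_if);
	__vgic_v3_deactivate_traps(cpu_if);

For nested virtualization the same helpers can later be handed a different cpu_if without touching the vcpu layout.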
Reviewed-by: James Morse Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_hyp.h | 12 ++++++------ arch/arm64/kvm/hyp/switch.c | 8 ++++---- arch/arm64/kvm/hyp/vgic-v3-sr.c | 33 ++++++++++----------------------- arch/arm64/kvm/vgic/vgic-v2.c | 10 +++++----- arch/arm64/kvm/vgic/vgic-v3.c | 14 ++++++++------ arch/arm64/kvm/vgic/vgic.c | 25 +++++++++++++++++-------- include/kvm/arm_vgic.h | 5 ++++- 7 files changed, 54 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fe57f60f06a8..4f67b0cdffe8 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -56,12 +56,12 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); -void __vgic_v3_save_state(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); -void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu); +void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_enable_traps(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 8a1e81a400e0..c07a45643cd4 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -270,8 +270,8 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_save_state(vcpu); - __vgic_v3_deactivate_traps(vcpu); + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); } } @@ -279,8 +279,8 @@ static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu) static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu) { if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_activate_traps(vcpu); - __vgic_v3_restore_state(vcpu); + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } } diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6b85773e15c4..10ed539835c1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -194,10 +194,9 @@ static u32 __hyp_text __vgic_v3_read_ap1rn(int n) return val; } -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; /* * Make sure stores to the GIC via the memory mapped interface @@ -230,10 +229,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs 
= cpu_if->used_lrs; int i; if (used_lrs || cpu_if->its_vpe.its_vm) { @@ -257,10 +255,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a * Group0 interrupt (as generated in GICv2 mode) to be @@ -306,9 +302,8 @@ void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } -void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u64 val; if (!cpu_if->vgic_sre) { @@ -333,15 +328,11 @@ void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu) write_gicreg(0, ICH_HCR_EL2); } -void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if; u64 val; u32 nr_pre_bits; - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); @@ -370,15 +361,11 @@ void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu) } } -void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu) +void __hyp_text __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { - struct vgic_v3_cpu_if *cpu_if; u64 val; u32 nr_pre_bits; - vcpu = kern_hyp_va(vcpu); - cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - val = read_gicreg(ICH_VTR_EL2); nr_pre_bits = vtr_to_nr_pre_bits(val); @@ -451,7 +438,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr, u64 *lr_val) { - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; u8 priority = GICv3_IDLE_PRIORITY; int i, lr = -1; @@ -490,7 +477,7 @@ static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid, u64 *lr_val) { - unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs; + unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; int i; for (i = 0; i < used_lrs; i++) { diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 621cc168fe3f..ebf53a4e1296 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -56,7 +56,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->vgic_hcr &= ~GICH_HCR_UIE; - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; @@ -120,7 +120,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_put_irq(vcpu->kvm, irq); } - vgic_cpu->used_lrs = 0; + cpuif->used_lrs = 0; } /* @@ -427,7 +427,7 @@ out: static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; u64 elrsr; int i; @@ -448,7 +448,7 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) void vgic_v2_save_state(struct kvm_vcpu *vcpu) { void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; if 
(!base) return; @@ -463,7 +463,7 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; void __iomem *base = kvm_vgic_global_state.vctrl_base; - u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; + u64 used_lrs = cpu_if->used_lrs; int i; if (!base) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3ccd6d3cb4d3..76e2d85789ed 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -39,7 +39,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) cpuif->vgic_hcr &= ~ICH_HCR_UIE; - for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { + for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid, cpuid; struct vgic_irq *irq; @@ -111,7 +111,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) vgic_put_irq(vcpu->kvm, irq); } - vgic_cpu->used_lrs = 0; + cpuif->used_lrs = 0; } /* Requires the irq to be locked already */ @@ -662,10 +662,10 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (likely(cpu_if->vgic_sre)) kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - kvm_call_hyp(__vgic_v3_restore_aprs, vcpu); + kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); if (has_vhe()) - __vgic_v3_activate_traps(vcpu); + __vgic_v3_activate_traps(cpu_if); WARN_ON(vgic_v4_load(vcpu)); } @@ -680,12 +680,14 @@ void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) void vgic_v3_put(struct kvm_vcpu *vcpu) { + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + WARN_ON(vgic_v4_put(vcpu, false)); vgic_v3_vmcr_sync(vcpu); - kvm_call_hyp(__vgic_v3_save_aprs, vcpu); + kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); if (has_vhe()) - __vgic_v3_deactivate_traps(vcpu); + __vgic_v3_deactivate_traps(cpu_if); } diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 99b02ca730a8..c3643b7f101b 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -786,6 +786,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) int count; bool multi_sgi; u8 prio = 0xff; + int i = 0; lockdep_assert_held(&vgic_cpu->ap_list_lock); @@ -827,11 +828,14 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) } } - vcpu->arch.vgic_cpu.used_lrs = count; - /* Nuke remaining LRs */ - for ( ; count < kvm_vgic_global_state.nr_lr; count++) - vgic_clear_lr(vcpu, count); + for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + vgic_clear_lr(vcpu, i); + + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count; + else + vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count; } static inline bool can_access_vgic_from_kernel(void) @@ -849,13 +853,13 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu) if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_save_state(vcpu); else - __vgic_v3_save_state(vcpu); + __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); } /* Sync back the hardware VGIC state into our emulation after a guest's run. 
*/ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int used_lrs; /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) @@ -864,7 +868,12 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) if (can_access_vgic_from_kernel()) vgic_save_state(vcpu); - if (vgic_cpu->used_lrs) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs; + else + used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs; + + if (used_lrs) vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); } @@ -874,7 +883,7 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu) if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_restore_state(vcpu); else - __vgic_v3_restore_state(vcpu); + __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } /* Flush our emulation state into the GIC hardware before entering the guest. */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 69f4164d6477..a8d8fdcd3723 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -274,6 +274,8 @@ struct vgic_v2_cpu_if { u32 vgic_vmcr; u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; + + unsigned int used_lrs; }; struct vgic_v3_cpu_if { @@ -291,6 +293,8 @@ struct vgic_v3_cpu_if { * linking the Linux IRQ subsystem and the ITS together. */ struct its_vpe its_vpe; + + unsigned int used_lrs; }; struct vgic_cpu { @@ -300,7 +304,6 @@ struct vgic_cpu { struct vgic_v3_cpu_if vgic_v3; }; - unsigned int used_lrs; struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; raw_spinlock_t ap_list_lock; /* Protects the ap_list */ -- cgit v1.2.3 From 0958f0cefede403037653e44de0e3332d10b0e1a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:19 +0200 Subject: KVM: introduce kvm_read_guest_offset_cached() We already have kvm_write_guest_offset_cached(), introduce read analogue. 
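A minimal usage sketch with hypothetical names (struct foo, kvm and ghc are placeholders, not taken from this patch): given a gfn_to_hva_cache already initialized over a guest-side structure, a single member can now be read without fetching the whole region.

	struct foo {
		u32 a;
		u32 b;
	};
	u32 b;

	if (kvm_read_guest_offset_cached(kvm, ghc, &b,
					 offsetof(struct foo, b), sizeof(b)))
		return -EFAULT;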
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 161684696610..f43b59b1294c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -734,6 +734,9 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len); +int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned int offset, + unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c9c6db5f77c2..e2af6ffce9c9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2510,13 +2510,15 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned int offset, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; + gpa_t gpa = ghc->gpa + offset; - BUG_ON(len > ghc->len); + BUG_ON(len + offset > ghc->len); if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) @@ -2527,14 +2529,21 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return -EFAULT; if (unlikely(!ghc->memslot)) - return kvm_read_guest(kvm, ghc->gpa, data, len); + return kvm_read_guest(kvm, gpa, data, len); - r = __copy_from_user(data, (void __user *)ghc->hva, len); + r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); if (r) return -EFAULT; return 0; } +EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); + +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) +{ + return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); +} EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -- cgit v1.2.3 From 72de5fa4c16195827252b961ba44028a39dfeaff Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:22 +0200 Subject: KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT Introduce a new capability to indicate that KVM supports interrupt-based delivery of 'page ready' APF events. This includes support for both MSR_KVM_ASYNC_PF_INT and MSR_KVM_ASYNC_PF_ACK.
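Illustration only, not part of the patch: a hedged guest-side sketch of the enable ordering documented in the msr.rst hunk below, where the 'page ready' vector must be written to MSR_KVM_ASYNC_PF_INT before bit 3 of MSR_KVM_ASYNC_PF_EN is set. DEMO_APF_VECTOR and demo_enable_async_pf_int() are made-up names, the vector choice is arbitrary, and wiring the IDT handler for that vector is omitted.

#include <linux/bits.h>
#include <asm/kvm_para.h>
#include <asm/msr.h>

#define DEMO_APF_VECTOR	0xf3	/* hypothetical vector picked by the guest */

static void demo_enable_async_pf_int(u64 apf_data_pa)
{
	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT))
		return;	/* stay with the legacy #PF-based delivery */

	/* Program the 'page ready' vector first... */
	wrmsrl(MSR_KVM_ASYNC_PF_INT, DEMO_APF_VECTOR);

	/* ...then enable APF: bit 0 enables, bit 3 selects interrupt delivery. */
	wrmsrl(MSR_KVM_ASYNC_PF_EN, apf_data_pa | BIT(0) | BIT(3));
}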
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/cpuid.rst | 6 ++++++ Documentation/virt/kvm/msr.rst | 12 ++++++++---- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index f721c89327ec..a7dff9186bed 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -86,6 +86,12 @@ KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit before using paravirtualized sched yield. +KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit + before using the second async + pf control msr 0x4b564d06 and + async pf acknowledgment msr + 0x4b564d07. + KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side per-cpu warps are expected in kvmclock diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index 9b107889b033..e37a14c323d2 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -213,7 +213,8 @@ data: cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as #PF vmexits. Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. Bit 3 enables interrupt based delivery of 'page ready' - events. + events. Bit 3 can only be set if KVM_FEATURE_ASYNC_PF_INT is present in + CPUID. 'Page not present' events are currently always delivered as synthetic #PF exception. During delivery of these events APF CR2 register contains @@ -242,7 +243,8 @@ data: Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism - in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. + in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. The MSR is + available if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. Note, previously, 'page ready' events were delivered via the same #PF exception as 'page not present' events but this is now deprecated. If @@ -360,7 +362,8 @@ data: Interrupt vector for asynchronous 'page ready' notifications delivery. The vector has to be set up before asynchronous page fault mechanism - is enabled in MSR_KVM_ASYNC_PF_EN. + is enabled in MSR_KVM_ASYNC_PF_EN. The MSR is only available if + KVM_FEATURE_ASYNC_PF_INT is present in CPUID. MSR_KVM_ASYNC_PF_ACK: 0x4b564d07 @@ -371,4 +374,5 @@ data: When the guest is done processing 'page ready' APF event and 'token' field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to write '1' to bit 0 of the MSR, this causes the host to re-scan its queue - and check if there are more notifications pending. + and check if there are more notifications pending. The MSR is available + if KVM_FEATURE_ASYNC_PF_INT is present in CPUID.
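Illustration only, not part of the patch: a hedged sketch of the acknowledgment protocol just described. Treating 'apf_data' as a guest-side mapping of the shared kvm_vcpu_pv_apf_data page is an assumption, as is the demo_* naming.

#include <linux/compiler.h>
#include <asm/kvm_para.h>
#include <asm/msr.h>

/*
 * After consuming a 'page ready' event: clear the token in the shared
 * area, then write '1' to bit 0 of MSR_KVM_ASYNC_PF_ACK so the host
 * re-scans its queue for further pending notifications.
 */
static void demo_ack_page_ready(struct kvm_vcpu_pv_apf_data *apf_data)
{
	WRITE_ONCE(apf_data->token, 0);
	wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}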
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 7ac20df80ba8..812e9b4c1114 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -31,6 +31,7 @@ #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 +#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cd708b0b460a..a9f1905896fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -711,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | - (1 << KVM_FEATURE_PV_SCHED_YIELD); + (1 << KVM_FEATURE_PV_SCHED_YIELD) | + (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9d709a672f3..ae3a7f2fbda2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3446,6 +3446,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ac9eba0289d1..fc0075988b80 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 +#define KVM_CAP_ASYNC_PF_INT 183 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From f7d31e65368aeef973fab788aa22c4f1d5a6af66 Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 24 Apr 2020 14:37:40 +0300 Subject: x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit The problem this patch addresses is that 'struct kvm_hyperv_exit' has a different layout when compiled in 32-bit and 64-bit modes. In 64-bit mode the default alignment boundary is 64 bits, forcing extra gaps after 'type' and 'msr', but in 32-bit mode the boundary is at 32 bits, so there are no extra gaps. This is an issue because even when the kernel is 64-bit, the userspace using the interface can be either 32-bit or 64-bit, and the same 32-bit userspace also has to work with a 32-bit kernel. The issue is fixed by forcing the 64-bit layout. This leads to an ABI change for 32-bit builds, and while we are obviously breaking the '32-bit userspace with 32-bit kernel' case, we're fixing the '32-bit userspace with 64-bit kernel' one. As the interface has no (known) users and 32-bit KVM is rather baroque nowadays, this seems like a reasonable decision. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Jon Doron Message-Id: <20200424113746.3473563-2-arilou@gmail.com> Reviewed-by: Roman Kagan Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 2 ++ include/uapi/linux/kvm.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d871dacb984e..d0569db2b1e2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5067,9 +5067,11 @@ EOI was received.
#define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 __u32 type; + __u32 pad1; union { struct { __u32 msr; + __u32 pad2; __u64 control; __u64 evt_page; __u64 msg_page; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index fc0075988b80..cc04aceb8793 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -189,9 +189,11 @@ struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 __u32 type; + __u32 pad1; union { struct { __u32 msr; + __u32 pad2; __u64 control; __u64 evt_page; __u64 msg_page; -- cgit v1.2.3 From f97f5a56f5977311f3833056a73cdbb0ee56cb1e Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 29 May 2020 16:45:40 +0300 Subject: x86/kvm/hyper-v: Add support for synthetic debugger interface Add support for the Hyper-V synthetic debugger (syndbg) interface. The syndbg interface uses MSRs to emulate a way to send/recv packet data. The debug transport DLL (kdvm/kdnet) will detect whether Hyper-V is enabled and, if the synthetic debugger interface is supported, will attempt to use it instead of trying to initialize a network adapter. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Jon Doron Message-Id: <20200529134543.1127440-4-arilou@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++ arch/x86/include/asm/kvm_host.h | 13 ++++ arch/x86/kvm/hyperv.c | 158 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/hyperv.h | 5 ++ arch/x86/kvm/trace.h | 51 +++++++++++++ arch/x86/kvm/x86.c | 8 ++ include/uapi/linux/kvm.h | 10 +++ 7 files changed, 258 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index db22e834ce2a..aad60be4884e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5070,6 +5070,7 @@ EOI was received. struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 + #define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -5085,6 +5086,15 @@ EOI was received. __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; /* KVM_EXIT_HYPERV */ @@ -5101,6 +5111,12 @@ Hyper-V SynIC state change. Notification is used to remap SynIC event/message pages and to enable/disable SynIC messages/events processing in userspace. + - KVM_EXIT_HYPERV_SYNDBG -- synchronously notify user-space about + +Hyper-V Synthetic debugger state change. Notification is used to either update +the pending_page location or to send a control command (send the buffer located
in send_page or recv a buffer to recv_page).
+ :: /* KVM_EXIT_ARM_NISV */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b878fcd164ce..58337a25396a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -863,6 +863,18 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; +/* Hyper-V synthetic debugger (SynDbg)*/ +struct kvm_hv_syndbg { + struct { + u64 control; + u64 status; + u64 send_page; + u64 recv_page; + u64 pending_page; + } control; + u64 options; +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -886,6 +898,7 @@ struct kvm_hv { atomic_t num_mismatched_vp_indexes; struct hv_partition_assist_pg *hv_pa_pg; + struct kvm_hv_syndbg hv_syndbg; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f9d3b919823c..c21f99357ad5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -21,6 +21,7 @@ #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" #include @@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, + HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, + 0); + if (!entry) + return false; + + return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; + hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case 
HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, + *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; @@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int r; mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else @@ -1795,6 +1922,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); @@ -1820,7 +1950,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; @@ -1859,6 +1989,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->ebx |= HV_X64_DEBUGGING; + ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + /* * Direct Synthetic timers only make sense with in-kernel * LAPIC @@ -1902,6 +2036,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + break; + default: break; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 7f50ff0bad07..e68c6c2e9649 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -73,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); } +static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +{ + return &vcpu->kvm->arch.hyperv.hv_syndbg; +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 54a10c98d746..b66432b015d2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1541,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +/* + * Tracepoint for syndbg_set_msr. 
+ */ +TRACE_EVENT(kvm_hv_syndbg_set_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); + +/* + * Tracepoint for syndbg_get_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_get_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca8a57312291..9e41b5135340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1246,6 +1246,10 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, @@ -3011,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3272,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cc04aceb8793..6721eb563eda 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -188,6 +188,7 @@ struct kvm_s390_cmma_log { struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 +#define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -203,6 +204,15 @@ struct kvm_hyperv_exit { __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; -- cgit v1.2.3
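Illustration only, not part of any patch above: a hedged userspace sketch of reacting to the new KVM_EXIT_HYPERV_SYNDBG exit; demo_handle_hyperv_exit() is a made-up name, and a real VMM would move the actual debug payload through send_page/recv_page. Per kvm_hv_syndbg_complete_userspace() above, a status value written back into the run structure is folded into the synthetic debugger's status when the exit was triggered by HV_X64_MSR_SYNDBG_CONTROL.

#include <linux/kvm.h>
#include <stdio.h>

static void demo_handle_hyperv_exit(struct kvm_run *run)
{
	struct kvm_hyperv_exit *hv = &run->hyperv;

	if (hv->type != KVM_EXIT_HYPERV_SYNDBG)
		return;

	/* Either a control command or a pending_page update arrived. */
	fprintf(stderr, "syndbg: msr=%#x control=%#llx pending_page=%#llx\n",
		hv->u.syndbg.msr,
		(unsigned long long)hv->u.syndbg.control,
		(unsigned long long)hv->u.syndbg.pending_page);

	/* Completion status; the kernel copies it back on vcpu re-entry
	 * when the exiting MSR was HV_X64_MSR_SYNDBG_CONTROL. */
	hv->u.syndbg.status = 0;
}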