From bba82fd75694832b59433bf4e9d7ede8de1dfd42 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Wed, 1 Feb 2017 17:06:11 +1300 Subject: KVM: x86: never specify a sample period for virtualized in_tx_cp counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pmc_reprogram_counter() always sets a sample period based on the value of pmc->counter. However, hsw_hw_config() rejects sample periods less than 2^31 - 1. So for example, if a KVM guest does struct perf_event_attr attr; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_RAW; attr.size = sizeof(attr); attr.config = 0x2005101c4; // conditional branches retired IN_TXCP attr.sample_period = 0; int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); the guest kernel counts some conditional branch events, then updates the virtual PMU register with a nonzero count. The host reaches pmc_reprogram_counter() with nonzero pmc->counter, triggers EOPNOTSUPP in hsw_hw_config(), prints "kvm_pmu: event creation failed" in pmc_reprogram_counter(), and silently (from the guest's point of view) stops counting events. We fix event counting by forcing attr.sample_period to always be zero for in_tx_cp counters. Sampling doesn't work, but it already didn't work and can't be fixed without major changes to the approach in hsw_hw_config(). Signed-off-by: Robert O'Callahan Signed-off-by: Radim Krčmář --- arch/x86/kvm/pmu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 06ce377dcbc9..026db42a86c3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -113,12 +113,19 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; + attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + if (in_tx) attr.config |= HSW_IN_TX; - if (in_tx_cp) + if (in_tx_cp) { + /* + * HSW_IN_TX_CHECKPOINTED is not supported with nonzero + * period. Just clear the sample period so at least + * allocating the counter doesn't fail. + */ + attr.sample_period = 0; attr.config |= HSW_IN_TX_CHECKPOINTED; - - attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + } event = perf_event_create_kernel_counter(&attr, -1, current, intr ? kvm_perf_overflow_intr : -- cgit v1.2.3 From b7ceaec112aa35aa287325754d8c52b8304892cd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Feb 2017 07:36:16 -0800 Subject: x86/asm: Tidy up TSS limit code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In an earlier version of the patch ("x86/kvm/vmx: Defer TR reload after VM exit") that introduced TSS limit validity tracking, I confused which helper was which. On reflection, the names I chose sucked. Rename the helpers to make it more obvious what's going on and add some comments. While I'm at it, clear __tss_limit_invalid when force-reloading as well as when contitionally reloading, since any TR reload fixes the limit. Signed-off-by: Andy Lutomirski Signed-off-by: Radim Krčmář --- arch/x86/include/asm/desc.h | 18 +++++++++++------- arch/x86/kernel/ioport.c | 8 +++++++- arch/x86/kernel/process.c | 6 +++--- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index cb8f9149f6c8..1548ca92ad3f 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -205,6 +205,8 @@ static inline void native_load_tr_desc(void) asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +DECLARE_PER_CPU(bool, __tss_limit_invalid); + static inline void force_reload_TR(void) { struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); @@ -220,18 +222,20 @@ static inline void force_reload_TR(void) write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); load_TR_desc(); + this_cpu_write(__tss_limit_invalid, false); } -DECLARE_PER_CPU(bool, need_tr_refresh); - -static inline void refresh_TR(void) +/* + * Call this if you need the TSS limit to be correct, which should be the case + * if and only if you have TIF_IO_BITMAP set or you're switching to a task + * with TIF_IO_BITMAP set. + */ +static inline void refresh_tss_limit(void) { DEBUG_LOCKS_WARN_ON(preemptible()); - if (unlikely(this_cpu_read(need_tr_refresh))) { + if (unlikely(this_cpu_read(__tss_limit_invalid))) force_reload_TR(); - this_cpu_write(need_tr_refresh, false); - } } /* @@ -250,7 +254,7 @@ static inline void invalidate_tss_limit(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) force_reload_TR(); else - this_cpu_write(need_tr_refresh, true); + this_cpu_write(__tss_limit_invalid, true); } static inline void native_load_gdt(const struct desc_ptr *dtr) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index b01bc8517450..875d3d25dd6a 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -47,8 +47,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + /* + * Now that we have an IO bitmap, we need our TSS limit to be + * correct. It's fine if we are preempted after doing this: + * with TIF_IO_BITMAP set, context switches will keep our TSS + * limit correct. + */ preempt_disable(); - refresh_TR(); + refresh_tss_limit(); preempt_enable(); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7780efa635b9..0b302591b51f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -65,8 +65,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -DEFINE_PER_CPU(bool, need_tr_refresh); -EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh); +DEFINE_PER_CPU(bool, __tss_limit_invalid); +EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* * this gets called so that we can store lazy state into memory and copy the @@ -218,7 +218,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. */ - refresh_TR(); + refresh_tss_limit(); } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* * Clear any possible leftover bits: -- cgit v1.2.3 From 0fce546f9f07b94ccc9de09cf48d35e18946d2fa Mon Sep 17 00:00:00 2001 From: Jérémy Lefaure Date: Sat, 25 Feb 2017 17:46:53 -0500 Subject: x86/kvm/vmx: remove unused variable in segment_base() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer 'struct desc_struct *d' is unused since commit 8c2e41f7ae12 ("x86/kvm/vmx: Simplify segment_base()") so let's remove it. Signed-off-by: Jérémy Lefaure Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef4ba71dbb66..764f1f897847 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2053,7 +2053,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - struct desc_struct *d; struct desc_struct *table; unsigned long v; -- cgit v1.2.3 From acc9ab60132739e0c5ab40644444cd7b22d12886 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 27 Feb 2017 04:24:39 -0800 Subject: KVM: nVMX: Fix pending events injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L2 fails to boot on a non-APICv box dues to 'commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")' KVM internal error. Suberror: 3 extra data[0]: 800000ef extra data[1]: 1 RAX=0000000000000000 RBX=ffffffff81f36140 RCX=0000000000000000 RDX=0000000000000000 RSI=0000000000000000 RDI=0000000000000000 RBP=ffff88007c92fe90 RSP=ffff88007c92fe90 R8 =ffff88007fccdca0 R9 =0000000000000000 R10=00000000fffedb3d R11=0000000000000000 R12=0000000000000003 R13=0000000000000000 R14=0000000000000000 R15=ffff88007c92c000 RIP=ffffffff810645e6 RFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 0000000000000000 ffffffff 00c00000 CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA] SS =0000 0000000000000000 ffffffff 00c00000 DS =0000 0000000000000000 ffffffff 00c00000 FS =0000 0000000000000000 ffffffff 00c00000 GS =0000 ffff88007fcc0000 ffffffff 00c00000 LDT=0000 0000000000000000 ffffffff 00c00000 TR =0040 ffff88007fcd4200 00002087 00008b00 DPL=0 TSS64-busy GDT= ffff88007fcc9000 0000007f IDT= ffffffffff578000 00000fff CR0=80050033 CR2=00000000ffffffff CR3=0000000001e0a000 CR4=003406e0 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000fffe0ff0 DR7=0000000000000400 EFER=0000000000000d01 We should try to reinject previous events if any before trying to inject new event if pending. If vmexit is triggered by L2 guest and L0 interested in, we should reinject IDT-vectoring info to L2 through vmcs02 if any, otherwise, we can consider new IRQs/NMIs which can be injected and call nested events callback to switch from L2 to L1 if needed and inject the proper vmexit events. However, 'commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")' results in the handle events order reversely on non-APICv box. This patch fixes it by bailing out for pending events and not consider new events in this scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jan Kiszka Signed-off-by: Wanpeng Li Fixes: 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 764f1f897847..283aa8601833 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10641,6 +10641,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vcpu->arch.exception.pending || + vcpu->arch.nmi_injected || + vcpu->arch.interrupt.pending) + return -EBUSY; + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && vmx->nested.preemption_timer_expired) { if (vmx->nested.nested_run_pending) @@ -10650,8 +10655,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) } if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending || - vcpu->arch.interrupt.pending) + if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | -- cgit v1.2.3