From 321c5658c5e9192dea0d58ab67cf1791e45b2b26 Mon Sep 17 00:00:00 2001
From: Yuki Shibuya
Date: Thu, 24 Mar 2016 05:17:03 +0000
Subject: KVM: x86: Inject pending interrupt even if pending nmi exist

Non-maskable interrupts (NMIs) are preferred to interrupts in the
current implementation. If an NMI is pending and NMI injection is
blocked according to nmi_allowed(), a pending interrupt is not
injected and enable_irq_window() is not executed, even when interrupt
injection is allowed.

In old kernels (e.g. 2.6.32), schedule() is often called in NMI
context. In this case, interrupts are needed to execute the iret that
marks the end of the NMI. The flag blocking new NMIs is not cleared
until the guest executes the iret, and interrupts stay blocked by the
pending NMI. As a result, the iret can never be invoked in the guest,
and the guest is starved until the block is cleared by some other
event (e.g. canceling injection).

This patch injects pending interrupts, when injection is allowed, even
if an NMI is blocked. In addition, if an interrupt is still pending
after inject_pending_event() has run, enable_irq_window() is executed
regardless of the NMI pending counter.

Cc: stable@vger.kernel.org
Signed-off-by: Yuki Shibuya
Suggested-by: Paolo Bonzini
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 742d0f7d3556..0a2c70e43bc8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6095,12 +6095,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 	}
 
 	/* try to inject new event if pending */
-	if (vcpu->arch.nmi_pending) {
-		if (kvm_x86_ops->nmi_allowed(vcpu)) {
-			--vcpu->arch.nmi_pending;
-			vcpu->arch.nmi_injected = true;
-			kvm_x86_ops->set_nmi(vcpu);
-		}
+	if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+		--vcpu->arch.nmi_pending;
+		vcpu->arch.nmi_injected = true;
+		kvm_x86_ops->set_nmi(vcpu);
 	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
 		/*
 		 * Because interrupts can be injected asynchronously, we are
@@ -6569,10 +6567,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (inject_pending_event(vcpu, req_int_win) != 0)
 		req_immediate_exit = true;
 	/* enable NMI/IRQ window open exits if needed */
-	else if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
+	else {
+		if (vcpu->arch.nmi_pending)
+			kvm_x86_ops->enable_nmi_window(vcpu);
+		if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+			kvm_x86_ops->enable_irq_window(vcpu);
+	}
 
 	if (kvm_lapic_enabled(vcpu)) {
 		update_cr8_intercept(vcpu);
--
cgit v1.2.3
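To make the control-flow change easier to see outside the diff, here is a
minimal userspace sketch of the post-patch decision order. It is an
illustration only: struct vcpu_state, pick_event() and the field names are
simplified stand-ins for the real KVM internals, not the kernel API.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for the relevant vCPU state (hypothetical names). */
    struct vcpu_state {
        unsigned int nmi_pending;    /* count of queued NMIs                 */
        bool nmi_allowed;            /* false until the guest executes iret  */
        bool intr_injectable;        /* an external interrupt is deliverable */
    };

    /* Post-patch ordering: a blocked NMI no longer shadows a ready IRQ. */
    static const char *pick_event(struct vcpu_state *v)
    {
        if (v->nmi_pending && v->nmi_allowed) {
            --v->nmi_pending;
            return "inject NMI";
        } else if (v->intr_injectable) {
            /*
             * Before the patch, a pending-but-blocked NMI took the outer
             * branch and never reached this one, starving the guest of
             * the interrupt it needed in order to execute iret.
             */
            return "inject IRQ";
        }
        return "open NMI/IRQ windows as needed";
    }

    int main(void)
    {
        struct vcpu_state v = {
            .nmi_pending = 1, .nmi_allowed = false, .intr_injectable = true
        };
        printf("%s\n", pick_event(&v));    /* prints "inject IRQ" */
        return 0;
    }

With the old code, this same state (one pending NMI that is blocked, one
deliverable IRQ) selected nothing and left the IRQ window closed, which is
exactly the starvation the commit message describes.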
From a2b5c3c0c8eea2d5d0eefcfc0fc0bdf386daa260 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 29 Mar 2016 11:23:25 +0200
Subject: KVM: Hyper-V: do not do hypercall userspace exits if SynIC is disabled

If SynIC is disabled, there is nothing that userspace can do to handle
these exits; on the other hand, userspace will probably not know about
KVM_EXIT_HYPERV_HCALL and will complain about it, or even exit. Just
prevent anything bad from happening by handling the hypercall in KVM
and returning an "invalid hypercall" code.

Fixes: 83326e43f27e9a8a501427a0060f8af519a39bb2
Cc: Andrey Smetanin
Reviewed-by: Roman Kagan
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/hyperv.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5ff3485acb60..01bd7b7a6866 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1116,6 +1116,11 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 		break;
 	case HVCALL_POST_MESSAGE:
 	case HVCALL_SIGNAL_EVENT:
+		/* don't bother userspace if it has no way to handle it */
+		if (!vcpu_to_synic(vcpu)->active) {
+			res = HV_STATUS_INVALID_HYPERCALL_CODE;
+			break;
+		}
 		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
 		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
 		vcpu->run->hyperv.u.hcall.input = param;
--
cgit v1.2.3

From 14ebda3394fd3e5388747e742e510b0802a65d24 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 29 Mar 2016 17:56:57 +0200
Subject: KVM: x86: reduce default value of halt_poll_ns parameter

Windows lets applications choose the frequency of the timer tick, and
in Windows 10 the maximum rate was changed from 1024 Hz to 2048 Hz.
Unfortunately, because of the way the Windows API works, most
applications that need a higher rate than the default 64 Hz will simply
do

    timeGetDevCaps(&tc, sizeof(tc));
    timeBeginPeriod(tc.wPeriodMin);

and pick the maximum rate.

This causes very high CPU usage when playing media or games on Windows
10, even if the guest does not actually use the CPU very much, because
the frequent timer tick causes halt_poll_ns to kick in.

There is no really good solution, especially because Microsoft could
sooner or later bump the limit to 4096 Hz, but for now the best we can
do is lower the upper limit for halt_poll_ns a bit. :-(

Reported-by: Jon Panozzo
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f62a9f37f79f..b7e394485a5f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -43,7 +43,7 @@
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 
-#define KVM_HALT_POLL_NS_DEFAULT 500000
+#define KVM_HALT_POLL_NS_DEFAULT 400000
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
--
cgit v1.2.3
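As a back-of-the-envelope check (not KVM code), the tick period at 2048 Hz
sits just inside the old 500 us poll window but outside the new 400 us one.
The snippet below compares only the steady-state tick period against the
maximum poll window and deliberately ignores KVM's adaptive grow/shrink
logic:

    #include <stdio.h>

    int main(void)
    {
        const long old_poll_ns = 500000;   /* old KVM_HALT_POLL_NS_DEFAULT */
        const long new_poll_ns = 400000;   /* value after this patch       */
        const int rates_hz[] = { 64, 1024, 2048 };

        for (int i = 0; i < 3; i++) {
            long tick_ns = 1000000000L / rates_hz[i];
            printf("%4d Hz tick = %8ld ns: old default %s, new default %s\n",
                   rates_hz[i], tick_ns,
                   tick_ns <= old_poll_ns ? "polls" : "sleeps",
                   tick_ns <= new_poll_ns ? "polls" : "sleeps");
        }
        return 0;
    }

At 2048 Hz the tick arrives every ~488281 ns, so a 500000 ns window keeps
the vCPU busy-polling through every single tick; a 400000 ns window lets it
actually sleep again.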
From 14f4760562e41d50817d56b42c821d70ad10b483 Mon Sep 17 00:00:00 2001
From: Yu Zhao
Date: Wed, 30 Mar 2016 13:38:09 -0700
Subject: kvm: set page dirty only if page has been writable

In the absence of the shadow dirty mask, there is no need to set a page
dirty if that page has never been writable. This is a tiny optimization,
but good to have for people who care about dirty page tracking.

Signed-off-by: Yu Zhao
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/mmu.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 70e95d097ef1..1ff4dbb73fb7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -557,8 +557,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	    !is_writable_pte(new_spte))
 		ret = true;
 
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask) {
+		/*
+		 * We don't set page dirty when dropping non-writable spte.
+		 * So do it now if the new spte is becoming non-writable.
+		 */
+		if (ret)
+			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 		return ret;
+	}
 
 	/*
 	 * Flush TLB when accessed/dirty bits are changed in the page tables,
@@ -605,7 +612,8 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 
-	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
+			PT_WRITABLE_MASK))
 		kvm_set_pfn_dirty(pfn);
 	return 1;
 }
--
cgit v1.2.3
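Reduced to a standalone predicate, the new test in
mmu_spte_clear_track_bits() reads as follows. should_set_pfn_dirty() is a
hypothetical model written for illustration; only PT_WRITABLE_MASK (bit 1
of an x86 PTE) mirrors the real definition.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PT_WRITABLE_MASK (1ULL << 1)    /* bit 1 of an x86 PTE */

    /*
     * With hardware dirty tracking (shadow_dirty_mask != 0), trust the
     * dirty bit; without it, a page can only be dirty if the spte ever
     * allowed writes.
     */
    static bool should_set_pfn_dirty(uint64_t old_spte,
                                     uint64_t shadow_dirty_mask)
    {
        uint64_t mask = shadow_dirty_mask ? shadow_dirty_mask
                                          : PT_WRITABLE_MASK;
        return (old_spte & mask) != 0;
    }

    int main(void)
    {
        /* No dirty mask available (e.g. EPT without A/D bits): */
        printf("%d\n", should_set_pfn_dirty(0x5, 0)); /* read-only -> 0 */
        printf("%d\n", should_set_pfn_dirty(0x7, 0)); /* writable  -> 1 */
        return 0;
    }

The old check, !shadow_dirty_mask || (old_spte & shadow_dirty_mask), marked
every dropped spte dirty whenever the dirty mask was absent, including
sptes that were never writable.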
From 61abdbe0bcc2b32745ab4479cc550f4c1f518ee2 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino
Date: Mon, 4 Apr 2016 16:46:07 -0400
Subject: kvm: x86: make lapic hrtimer pinned

When a vCPU runs on a nohz_full core, the hrtimer used by the lapic
emulation code can be migrated to another core. When this happens, it's
possible to observe millisecond latency when delivering timer IRQs to
KVM guests.

The huge latency is mainly due to the fact that apic_timer_fn() expects
to run during a kvm exit. It sets KVM_REQ_PENDING_TIMER and lets it be
handled on the next kvm entry. However, if the timer fires on a
different core, we have to wait until the next kvm exit for the guest
to see KVM_REQ_PENDING_TIMER set.

This problem became visible after commit 9642d18ee. This commit changed
the timer migration code to always attempt to migrate timers away from
nohz_full cores. While it's debatable whether this is correct/desirable
(I don't think it is), it's clear that the lapic emulation code has a
requirement on firing the hrtimer on the same core where it was
started. This is achieved by making the hrtimer pinned.

Lastly, note that KVM has code to migrate timers when a vCPU is
scheduled to run on a different core. However, this forced migration
may fail. When this happens, we can have the same problem. If we want
100% correctness, we'll have to modify apic_timer_fn() to cause a kvm
exit when it runs on a different core than the vCPU. Not sure if this
is possible.

Here's a reproducer for the issue being fixed:

1. Set all cores but core0 to be nohz_full cores
2. Start a guest with a single vCPU
3. Trace apic_timer_fn() and kvm_inject_apic_timer_irqs()

You'll see that apic_timer_fn() will run on core0 while
kvm_inject_apic_timer_irqs() runs on a different core. If you get both
on core0, try running a program that takes 100% of the CPU and pin it
to core0 to force the vCPU out.

Signed-off-by: Luiz Capitulino
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/lapic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 443d2a57ad3d..1a2da0e5a373 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1369,7 +1369,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 
 		hrtimer_start(&apic->lapic_timer.timer,
 			      ktime_add_ns(now, apic->lapic_timer.period),
-			      HRTIMER_MODE_ABS);
+			      HRTIMER_MODE_ABS_PINNED);
 
 		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
@@ -1402,7 +1402,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			expire = ktime_add_ns(now, ns);
 			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				      expire, HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS_PINNED);
 		} else
 			apic_timer_expired(apic);
 
@@ -1868,7 +1868,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	apic->vcpu = vcpu;
 
 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_ABS);
+		     HRTIMER_MODE_ABS_PINNED);
 	apic->lapic_timer.timer.function = apic_timer_fn;
 
 	/*
@@ -2003,7 +2003,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 
 	timer = &vcpu->arch.apic->lapic_timer.timer;
 	if (hrtimer_cancel(timer))
-		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
--
cgit v1.2.3
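For reference, the pinned-timer pattern the patch converts to looks like
this in isolation. This is a hypothetical minimal kernel-module sketch: the
module scaffolding and names are invented, while hrtimer_init(),
hrtimer_start(), hrtimer_cancel() and HRTIMER_MODE_ABS_PINNED are the real
API of this era.

    #include <linux/module.h>
    #include <linux/hrtimer.h>
    #include <linux/ktime.h>
    #include <linux/smp.h>

    static struct hrtimer demo_timer;

    /* Fires on the CPU that armed the timer, because the timer is pinned. */
    static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
    {
        pr_info("pinned timer fired on CPU %d\n", smp_processor_id());
        return HRTIMER_NORESTART;
    }

    static int __init demo_init(void)
    {
        ktime_t expires = ktime_add_ns(ktime_get(), NSEC_PER_SEC);

        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        demo_timer.function = demo_timer_fn;
        /* _PINNED exempts the timer from migration off nohz_full cores. */
        hrtimer_start(&demo_timer, expires, HRTIMER_MODE_ABS_PINNED);
        return 0;
    }

    static void __exit demo_exit(void)
    {
        hrtimer_cancel(&demo_timer);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

Pinning trades load balancing for determinism: the callback always runs on
the core that armed the timer, so KVM_REQ_PENDING_TIMER is noticed on that
vCPU's next entry rather than waiting for an unrelated exit on another
core.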