From dea6e140d927b8d9b299f972eac5574de71bc75f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 30 Aug 2022 15:37:15 +0200 Subject: KVM: x86: hyper-v: Cache HYPERV_CPUID_NESTED_FEATURES CPUID leaf KVM has to check guest visible HYPERV_CPUID_NESTED_FEATURES.EBX CPUID leaf to know which Enlightened VMCS definition to use (original or 2022 update). Cache the leaf along with other Hyper-V CPUID feature leaves to make the check quick. Reviewed-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220830133737.1539624-12-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2c96c43c313a..9411348e4223 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -615,6 +615,8 @@ struct kvm_vcpu_hv { u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ + u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ + u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; }; -- cgit v1.2.3 From 6ad75c5c99f78e28b6ff2a44be167cd857270405 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Aug 2022 23:16:00 +0000 Subject: KVM: x86: Rename kvm_x86_ops.queue_exception to inject_exception Rename the kvm_x86_ops hook for exception injection to better reflect reality, and to align with pretty much every other related function name in KVM. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20220830231614.3580124-14-seanjc@google.com Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 51f777071584..82ba4a564e58 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -67,7 +67,7 @@ KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) KVM_X86_OP(inject_irq) KVM_X86_OP(inject_nmi) -KVM_X86_OP(queue_exception) +KVM_X86_OP(inject_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) KVM_X86_OP(nmi_allowed) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9411348e4223..032d99796445 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1525,7 +1525,7 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu); + void (*inject_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ec2b42a5abe3..65a72afbcdd5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -461,7 +461,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) return 0; } -static void svm_queue_exception(struct kvm_vcpu *vcpu) +static void svm_inject_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); unsigned nr = vcpu->arch.exception.nr; @@ -4790,7 +4790,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .patch_hypercall = svm_patch_hypercall, .inject_irq = svm_inject_irq, .inject_nmi = svm_inject_nmi, - .queue_exception = svm_queue_exception, + .inject_exception = svm_inject_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 497d14e515ed..d046df660752 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1684,7 +1684,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_queue_exception(struct kvm_vcpu *vcpu) +static void vmx_inject_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned nr = vcpu->arch.exception.nr; @@ -8054,7 +8054,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .patch_hypercall = vmx_patch_hypercall, .inject_irq = vmx_inject_irq, .inject_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, + .inject_exception = vmx_inject_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ee3041da222b..4cb177b616c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9725,7 +9725,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; - static_call(kvm_x86_queue_exception)(vcpu); + static_call(kvm_x86_inject_exception)(vcpu); } static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) -- cgit v1.2.3 From d4963e319f1f7851a098df6610a27f9f4cf6d42a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Aug 2022 23:16:01 +0000 Subject: KVM: x86: Make kvm_queued_exception a properly named, visible struct Move the definition of "struct kvm_queued_exception" out of kvm_vcpu_arch in anticipation of adding a second instance in kvm_vcpu_arch to handle exceptions that occur when vectoring an injected exception and are morphed to VM-Exit instead of leading to #DF. Opportunistically take advantage of the churn to rename "nr" to "vector". No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20220830231614.3580124-15-seanjc@google.com Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 23 ++++++------ arch/x86/kvm/svm/nested.c | 47 ++++++++++++------------ arch/x86/kvm/svm/svm.c | 14 ++++---- arch/x86/kvm/vmx/nested.c | 42 +++++++++++----------- arch/x86/kvm/vmx/vmx.c | 20 +++++------ arch/x86/kvm/x86.c | 80 ++++++++++++++++++++--------------------- arch/x86/kvm/x86.h | 3 +- 7 files changed, 113 insertions(+), 116 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 032d99796445..76fda10baa3d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -641,6 +641,17 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; }; +struct kvm_queued_exception { + bool pending; + bool injected; + bool has_error_code; + u8 vector; + u32 error_code; + unsigned long payload; + bool has_payload; + u8 nested_apf; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -739,16 +750,8 @@ struct kvm_vcpu_arch { u8 event_exit_inst_len; - struct kvm_queued_exception { - bool pending; - bool injected; - bool has_error_code; - u8 nr; - u32 error_code; - unsigned long payload; - bool has_payload; - u8 nested_apf; - } exception; + /* Exceptions to be injected to the guest. */ + struct kvm_queued_exception exception; struct kvm_queued_interrupt { bool injected; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 66f63e58efcc..bbfbceaca6ad 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -468,7 +468,7 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, unsigned int nr; if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; + nr = vcpu->arch.exception.vector; exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; if (vcpu->arch.exception.has_error_code) { @@ -1310,42 +1310,45 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu) static bool nested_exit_on_exception(struct vcpu_svm *svm) { - unsigned int nr = svm->vcpu.arch.exception.nr; + unsigned int vector = svm->vcpu.arch.exception.vector; - return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector)); } -static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) { - unsigned int nr = svm->vcpu.arch.exception.nr; + struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; vmcb->control.exit_code_hi = 0; - if (svm->vcpu.arch.exception.has_error_code) - vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; + if (ex->has_error_code) + vmcb->control.exit_info_1 = ex->error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (nr == PF_VECTOR) { - if (svm->vcpu.arch.exception.nested_apf) - vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + if (ex->vector == PF_VECTOR) { + if (ex->nested_apf) + vmcb->control.exit_info_2 = vcpu->arch.apf.nested_apf_token; + else if (ex->has_payload) + vmcb->control.exit_info_2 = ex->payload; else - vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - } else if (nr == DB_VECTOR) { + vmcb->control.exit_info_2 = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { /* See inject_pending_event. */ - kvm_deliver_exception_payload(&svm->vcpu); - if (svm->vcpu.arch.dr7 & DR7_GD) { - svm->vcpu.arch.dr7 &= ~DR7_GD; - kvm_update_dr7(&svm->vcpu); + kvm_deliver_exception_payload(vcpu, ex); + + if (vcpu->arch.dr7 & DR7_GD) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); } - } else - WARN_ON(svm->vcpu.arch.exception.has_payload); + } else { + WARN_ON(ex->has_payload); + } nested_svm_vmexit(svm); } @@ -1383,7 +1386,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_exception(svm)) return 0; - nested_svm_inject_exception_vmexit(svm); + nested_svm_inject_exception_vmexit(vcpu); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 65a72afbcdd5..0d7b8f7d8d4c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -463,22 +463,20 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) static void svm_inject_exception(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; struct vcpu_svm *svm = to_svm(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, ex); - if (kvm_exception_is_soft(nr) && + if (kvm_exception_is_soft(ex->vector) && svm_update_soft_interrupt_rip(vcpu)) return; - svm->vmcb->control.event_inj = nr + svm->vmcb->control.event_inj = ex->vector | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0) | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; + svm->vmcb->control.event_inj_err = ex->error_code; } static void svm_init_erratum_383(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index cb1b3d1dec0e..8e7f8cebce4d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -446,29 +446,27 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, */ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - if (nr == PF_VECTOR) { - if (vcpu->arch.exception.nested_apf) { + if (ex->vector == PF_VECTOR) { + if (ex->nested_apf) { *exit_qual = vcpu->arch.apf.nested_apf_token; return 1; } - if (nested_vmx_is_page_fault_vmexit(vmcs12, - vcpu->arch.exception.error_code)) { - *exit_qual = has_payload ? payload : vcpu->arch.cr2; + if (nested_vmx_is_page_fault_vmexit(vmcs12, ex->error_code)) { + *exit_qual = ex->has_payload ? ex->payload : vcpu->arch.cr2; return 1; } - } else if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) { - if (!has_payload) { - payload = vcpu->arch.dr6; - payload &= ~DR6_BT; - payload ^= DR6_ACTIVE_LOW; + } else if (vmcs12->exception_bitmap & (1u << ex->vector)) { + if (ex->vector == DB_VECTOR) { + if (ex->has_payload) { + *exit_qual = ex->payload; + } else { + *exit_qual = vcpu->arch.dr6; + *exit_qual &= ~DR6_BT; + *exit_qual ^= DR6_ACTIVE_LOW; } - *exit_qual = payload; } else *exit_qual = 0; return 1; @@ -3764,7 +3762,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, is_double_fault(exit_intr_info))) { vmcs12->idt_vectoring_info_field = 0; } else if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; + nr = vcpu->arch.exception.vector; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (kvm_exception_is_soft(nr)) { @@ -3868,11 +3866,11 @@ mmio_needed: static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, unsigned long exit_qual) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (vcpu->arch.exception.has_error_code) { + if (ex->has_error_code) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the @@ -3882,11 +3880,11 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, * generate "full" 32-bit error codes, so KVM allows userspace * to inject exception error codes with bits 31:16 set. */ - vmcs12->vm_exit_intr_error_code = (u16)vcpu->arch.exception.error_code; + vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (kvm_exception_is_soft(nr)) + if (kvm_exception_is_soft(ex->vector)) intr_info |= INTR_TYPE_SOFT_EXCEPTION; else intr_info |= INTR_TYPE_HARD_EXCEPTION; @@ -3917,7 +3915,7 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, static inline unsigned long vmx_get_pending_dbg_trap(struct kvm_vcpu *vcpu) { if (!vcpu->arch.exception.pending || - vcpu->arch.exception.nr != DB_VECTOR) + vcpu->arch.exception.vector != DB_VECTOR) return 0; /* General Detect #DBs are always fault-like. */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d046df660752..f555be2be993 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1659,7 +1659,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || - vcpu->arch.exception.nr == DB_VECTOR)) + vcpu->arch.exception.vector == DB_VECTOR)) vmx->nested.mtf_pending = true; else vmx->nested.mtf_pending = false; @@ -1686,15 +1686,13 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) static void vmx_inject_exception(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, ex); - if (has_error_code) { + if (ex->has_error_code) { /* * Despite the error code being architecturally defined as 32 * bits, and the VMCS field being 32 bits, Intel CPUs and thus @@ -1705,21 +1703,21 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu) * the upper bits to avoid VM-Fail, losing information that * does't really exist is preferable to killing the VM. */ - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (vmx->rmode.vm86_active) { int inc_eip = 0; - if (kvm_exception_is_soft(nr)) + if (kvm_exception_is_soft(ex->vector)) inc_eip = vcpu->arch.event_exit_inst_len; - kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); + kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return; } WARN_ON_ONCE(vmx->emulation_required); - if (kvm_exception_is_soft(nr)) { + if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cb177b616c5..b156dde99504 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -561,16 +561,13 @@ static int exception_type(int vector) return EXCPT_FAULT; } -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, + struct kvm_queued_exception *ex) { - unsigned nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (!has_payload) + if (!ex->has_payload) return; - switch (nr) { + switch (ex->vector) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The @@ -595,8 +592,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * So they need to be flipped for DR6. */ vcpu->arch.dr6 |= DR6_ACTIVE_LOW; - vcpu->arch.dr6 |= payload; - vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; + vcpu->arch.dr6 |= ex->payload; + vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -607,12 +604,12 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: - vcpu->arch.cr2 = payload; + vcpu->arch.cr2 = ex->payload; break; } - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; + ex->has_payload = false; + ex->payload = 0; } EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); @@ -651,17 +648,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.injected = false; } vcpu->arch.exception.has_error_code = has_error; - vcpu->arch.exception.nr = nr; + vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; if (!is_guest_mode(vcpu)) - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, + &vcpu->arch.exception); return; } /* to check exception */ - prev_nr = vcpu->arch.exception.nr; + prev_nr = vcpu->arch.exception.vector; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -679,7 +677,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.vector = DF_VECTOR; vcpu->arch.exception.error_code = 0; vcpu->arch.exception.has_payload = false; vcpu->arch.exception.payload = 0; @@ -5015,25 +5013,24 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + process_nmi(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). Unless the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of - * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we - * opportunistically defer the exception payload, deliver it if the - * capability hasn't been requested before processing a - * KVM_GET_VCPU_EVENTS. + * In guest mode, payload delivery should be deferred if the exception + * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 + * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not + * propagate the payload and so it cannot be safely deferred. Deliver + * the payload if the capability hasn't been requested. */ if (!vcpu->kvm->arch.exception_payload_enabled && - vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) - kvm_deliver_exception_payload(vcpu); + ex->pending && ex->has_payload) + kvm_deliver_exception_payload(vcpu, ex); /* * The API doesn't provide the instruction length for software @@ -5041,26 +5038,25 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { + if (kvm_exception_is_soft(ex->vector)) { events->exception.injected = 0; events->exception.pending = 0; } else { - events->exception.injected = vcpu->arch.exception.injected; - events->exception.pending = vcpu->arch.exception.pending; + events->exception.injected = ex->injected; + events->exception.pending = ex->pending; /* * For ABI compatibility, deliberately conflate * pending and injected exceptions when * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. */ if (!vcpu->kvm->arch.exception_payload_enabled) - events->exception.injected |= - vcpu->arch.exception.pending; + events->exception.injected |= ex->pending; } - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - events->exception_has_payload = vcpu->arch.exception.has_payload; - events->exception_payload = vcpu->arch.exception.payload; + events->exception.nr = ex->vector; + events->exception.has_error_code = ex->has_error_code; + events->exception.error_code = ex->error_code; + events->exception_has_payload = ex->has_payload; + events->exception_payload = ex->payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; @@ -5132,7 +5128,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, process_nmi(vcpu); vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; - vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.vector = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; vcpu->arch.exception.has_payload = events->exception_has_payload; @@ -9718,7 +9714,7 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, + trace_kvm_inj_exception(vcpu->arch.exception.vector, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, vcpu->arch.exception.injected); @@ -9790,12 +9786,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) * describe the behavior of General Detect #DBs, which are * fault-like. They do _not_ set RF, a la code breakpoints. */ - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR) { - kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.exception.vector == DB_VECTOR) { + kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; kvm_update_dr7(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1926d2cb8e79..4147d27f9fbc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -286,7 +286,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, int handle_ud(struct kvm_vcpu *vcpu); -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu); +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, + struct kvm_queued_exception *ex); void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); -- cgit v1.2.3 From 7709aba8f71613ae5d18d8c00adb54948e6bedb3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Aug 2022 23:16:08 +0000 Subject: KVM: x86: Morph pending exceptions to pending VM-Exits at queue time Morph pending exceptions to pending VM-Exits (due to interception) when the exception is queued instead of waiting until nested events are checked at VM-Entry. This fixes a longstanding bug where KVM fails to handle an exception that occurs during delivery of a previous exception, KVM (L0) and L1 both want to intercept the exception (e.g. #PF for shadow paging), and KVM determines that the exception is in the guest's domain, i.e. queues the new exception for L2. Deferring the interception check causes KVM to esclate various combinations of injected+pending exceptions to double fault (#DF) without consulting L1's interception desires, and ends up injecting a spurious #DF into L2. KVM has fudged around the issue for #PF by special casing emulated #PF injection for shadow paging, but the underlying issue is not unique to shadow paging in L0, e.g. if KVM is intercepting #PF because the guest has a smaller maxphyaddr and L1 (but not L0) is using shadow paging. Other exceptions are affected as well, e.g. if KVM is intercepting #GP for one of SVM's workaround or for the VMware backdoor emulation stuff. The other cases have gone unnoticed because the #DF is spurious if and only if L1 resolves the exception, e.g. KVM's goofs go unnoticed if L1 would have injected #DF anyways. The hack-a-fix has also led to ugly code, e.g. bailing from the emulator if #PF injection forced a nested VM-Exit and the emulator finds itself back in L1. Allowing for direct-to-VM-Exit queueing also neatly solves the async #PF in L2 mess; no need to set a magic flag and token, simply queue a #PF nested VM-Exit. Deal with event migration by flagging that a pending exception was queued by userspace and check for interception at the next KVM_RUN, e.g. so that KVM does the right thing regardless of the order in which userspace restores nested state vs. event state. When "getting" events from userspace, simply drop any pending excpetion that is destined to be intercepted if there is also an injected exception to be migrated. Ideally, KVM would migrate both events, but that would require new ABI, and practically speaking losing the event is unlikely to be noticed, let alone fatal. The injected exception is captured, RIP still points at the original faulting instruction, etc... So either the injection on the target will trigger the same intercepted exception, or the source of the intercepted exception was transient and/or non-deterministic, thus dropping it is ok-ish. Fixes: a04aead144fd ("KVM: nSVM: fix running nested guests when npt=0") Fixes: feaf0c7dc473 ("KVM: nVMX: Do not generate #DF if #PF happens during exception delivery into L2") Cc: Jim Mattson Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Link: https://lore.kernel.org/r/20220830231614.3580124-22-seanjc@google.com Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 +-- arch/x86/kvm/svm/nested.c | 45 ++++-------- arch/x86/kvm/vmx/nested.c | 109 ++++++++++++--------------- arch/x86/kvm/vmx/vmx.c | 6 +- arch/x86/kvm/x86.c | 159 +++++++++++++++++++++++++++------------- arch/x86/kvm/x86.h | 7 ++ 6 files changed, 188 insertions(+), 150 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76fda10baa3d..b3ce723efb43 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -649,7 +649,6 @@ struct kvm_queued_exception { u32 error_code; unsigned long payload; bool has_payload; - u8 nested_apf; }; struct kvm_vcpu_arch { @@ -750,8 +749,12 @@ struct kvm_vcpu_arch { u8 event_exit_inst_len; + bool exception_from_userspace; + /* Exceptions to be injected to the guest. */ struct kvm_queued_exception exception; + /* Exception VM-Exits to be synthesized to L1. */ + struct kvm_queued_exception exception_vmexit; struct kvm_queued_interrupt { bool injected; @@ -862,7 +865,6 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; u32 host_apf_flags; - unsigned long nested_apf_token; bool delivery_as_pf_vmexit; bool pageready_pending; } apf; @@ -1638,9 +1640,9 @@ struct kvm_x86_ops { struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); + bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); - bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu, - struct x86_exception *fault); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, @@ -1867,7 +1869,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2ecc64c3f6ee..cf22900f7c54 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -55,28 +55,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } -static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - - WARN_ON(!is_guest_mode(vcpu)); - - if (vmcb12_is_intercept(&svm->nested.ctl, - INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !WARN_ON_ONCE(svm->nested.nested_run_pending)) { - vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; - vmcb->control.exit_code_hi = 0; - vmcb->control.exit_info_1 = fault->error_code; - vmcb->control.exit_info_2 = fault->address; - nested_svm_vmexit(svm); - return true; - } - - return false; -} - static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1308,16 +1286,17 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu) return 0; } -static bool nested_exit_on_exception(struct vcpu_svm *svm) +static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) { - unsigned int vector = svm->vcpu.arch.exception.vector; + struct vcpu_svm *svm = to_svm(vcpu); return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector)); } static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) { - struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; @@ -1332,9 +1311,7 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) * than #PF. */ if (ex->vector == PF_VECTOR) { - if (ex->nested_apf) - vmcb->control.exit_info_2 = vcpu->arch.apf.nested_apf_token; - else if (ex->has_payload) + if (ex->has_payload) vmcb->control.exit_info_2 = ex->payload; else vmcb->control.exit_info_2 = vcpu->arch.cr2; @@ -1387,15 +1364,19 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception_vmexit.pending) { if (block_nested_exceptions) return -EBUSY; - if (!nested_exit_on_exception(svm)) - return 0; nested_svm_inject_exception_vmexit(vcpu); return 0; } + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; + return 0; + } + if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; @@ -1733,8 +1714,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) struct kvm_x86_nested_ops svm_nested_ops = { .leave_nested = svm_leave_nested, + .is_exception_vmexit = nested_svm_is_exception_vmexit, .check_events = svm_check_nested_events, - .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e0a93d974829..4da0558943ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -439,59 +439,22 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, return inequality ^ bit; } - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) -{ - struct kvm_queued_exception *ex = &vcpu->arch.exception; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (ex->vector == PF_VECTOR) { - if (ex->nested_apf) { - *exit_qual = vcpu->arch.apf.nested_apf_token; - return 1; - } - if (nested_vmx_is_page_fault_vmexit(vmcs12, ex->error_code)) { - *exit_qual = ex->has_payload ? ex->payload : vcpu->arch.cr2; - return 1; - } - } else if (vmcs12->exception_bitmap & (1u << ex->vector)) { - if (ex->vector == DB_VECTOR) { - if (ex->has_payload) { - *exit_qual = ex->payload; - } else { - *exit_qual = vcpu->arch.dr6; - *exit_qual &= ~DR6_BT; - *exit_qual ^= DR6_ACTIVE_LOW; - } - } else - *exit_qual = 0; - return 1; - } - - return 0; -} - -static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu, - struct x86_exception *fault) +static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - WARN_ON(!is_guest_mode(vcpu)); + /* + * Drop bits 31:16 of the error code when performing the #PF mask+match + * check. All VMCS fields involved are 32 bits, but Intel CPUs never + * set bits 31:16 and VMX disallows setting bits 31:16 in the injected + * error code. Including the to-be-dropped bits in the check might + * result in an "impossible" or missed exit from L1's perspective. + */ + if (vector == PF_VECTOR) + return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && - !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) { - vmcs12->vm_exit_intr_error_code = fault->error_code; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, - fault->address); - return true; - } - return false; + return (vmcs12->exception_bitmap & (1u << vector)); } static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, @@ -3863,12 +3826,24 @@ mmio_needed: return -ENXIO; } -static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, - unsigned long exit_qual) +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) { - struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qual; + + if (ex->has_payload) { + exit_qual = ex->payload; + } else if (ex->vector == PF_VECTOR) { + exit_qual = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { + exit_qual = vcpu->arch.dr6; + exit_qual &= ~DR6_BT; + exit_qual ^= DR6_ACTIVE_LOW; + } else { + exit_qual = 0; + } if (ex->has_error_code) { /* @@ -4041,7 +4016,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qual; /* * Only a pending nested run blocks a pending exception. If there is a * previously injected event, the pending exception occurred while said @@ -4095,14 +4069,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) * across SMI/RSM as it should; that needs to be addressed in order to * prioritize SMI over MTF and trap-like #DBs. */ + if (vcpu->arch.exception_vmexit.pending && + !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { + if (block_nested_exceptions) + return -EBUSY; + + nested_vmx_inject_exception_vmexit(vcpu); + return 0; + } + if (vcpu->arch.exception.pending && !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { if (block_nested_exceptions) return -EBUSY; - if (!nested_vmx_check_exception(vcpu, &exit_qual)) - goto no_vmexit; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - return 0; + goto no_vmexit; } if (vmx->nested.mtf_pending) { @@ -4113,15 +4093,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception_vmexit.pending) { if (block_nested_exceptions) return -EBUSY; - if (!nested_vmx_check_exception(vcpu, &exit_qual)) - goto no_vmexit; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + + nested_vmx_inject_exception_vmexit(vcpu); return 0; } + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; + goto no_vmexit; + } + if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; @@ -6984,8 +6969,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, + .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, - .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround, .hv_timer_pending = nested_vmx_preemption_timer_pending, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f555be2be993..8d793260db6a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1659,7 +1659,9 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || - vcpu->arch.exception.vector == DB_VECTOR)) + vcpu->arch.exception.vector == DB_VECTOR) && + (!vcpu->arch.exception_vmexit.pending || + vcpu->arch.exception_vmexit.vector == DB_VECTOR)) vmx->nested.mtf_pending = true; else vmx->nested.mtf_pending = false; @@ -5692,7 +5694,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - (vcpu->arch.exception.pending || vcpu->arch.exception.injected); + (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 458c7e35731a..734b206ab8b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -613,6 +613,21 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, + bool has_error_code, u32 error_code, + bool has_payload, unsigned long payload) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + + ex->vector = vector; + ex->injected = false; + ex->pending = true; + ex->has_error_code = has_error_code; + ex->error_code = error_code; + ex->has_payload = has_payload; + ex->payload = payload; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) @@ -622,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * If the exception is destined for L2 and isn't being reinjected, + * morph it to a VM-Exit if L1 wants to intercept the exception. A + * previously injected exception is not checked because it was checked + * when it was original queued, and re-checking is incorrect if _L1_ + * injected the exception, in which case it's exempt from interception. + */ + if (!reinject && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { + kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, + has_payload, payload); + return; + } + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: if (reinject) { /* - * On vmentry, vcpu->arch.exception.pending is only - * true if an event injection was blocked by - * nested_run_pending. In that case, however, - * vcpu_enter_guest requests an immediate exit, - * and the guest shouldn't proceed far enough to - * need reinjection. + * On VM-Entry, an exception can be pending if and only + * if event injection was blocked by nested_run_pending. + * In that case, however, vcpu_enter_guest() requests an + * immediate exit, and the guest shouldn't proceed far + * enough to need reinjection. */ - WARN_ON_ONCE(vcpu->arch.exception.pending); + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); vcpu->arch.exception.injected = true; if (WARN_ON_ONCE(has_payload)) { /* @@ -736,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.exception.nested_apf = - is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) { - vcpu->arch.apf.nested_apf_token = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); - } else { + + /* + * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of + * whether or not L1 wants to intercept "regular" #PF. + */ + if (is_guest_mode(vcpu) && fault->async_page_fault) + kvm_queue_exception_vmexit(vcpu, PF_VECTOR, + true, fault->error_code, + true, fault->address); + else kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); - } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -/* Returns true if the page fault was immediately morphed into a VM-Exit. */ -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct kvm_mmu *fault_mmu; @@ -767,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, fault_mmu->root.hpa); - /* - * A workaround for KVM's bad exception handling. If KVM injected an - * exception into L2, and L2 encountered a #PF while vectoring the - * injected exception, manually check to see if L1 wants to intercept - * #PF, otherwise queuing the #PF will lead to #DF or a lost exception. - * In all other cases, defer the check to nested_ops->check_events(), - * which will correctly handle priority (this does not). Note, other - * exceptions, e.g. #GP, are theoretically affected, #PF is simply the - * most problematic, e.g. when L0 and L1 are both intercepting #PF for - * shadow paging. - * - * TODO: Rewrite exception handling to track injected and pending - * (VM-Exit) exceptions separately. - */ - if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) && - kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault)) - return true; - fault_mmu->inject_page_fault(vcpu, fault); - return false; } EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); @@ -4835,7 +4846,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) return (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_accept_dm_intr(vcpu) && !kvm_event_needs_reinjection(vcpu) && - !vcpu->arch.exception.pending); + !kvm_is_exception_pending(vcpu)); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -5010,13 +5021,27 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct kvm_queued_exception *ex; process_nmi(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); + /* + * KVM's ABI only allows for one exception to be migrated. Luckily, + * the only time there can be two queued exceptions is if there's a + * non-exiting _injected_ exception, and a pending exiting exception. + * In that case, ignore the VM-Exiting exception as it's an extension + * of the injected exception. + */ + if (vcpu->arch.exception_vmexit.pending && + !vcpu->arch.exception.pending && + !vcpu->arch.exception.injected) + ex = &vcpu->arch.exception_vmexit; + else + ex = &vcpu->arch.exception; + /* * In guest mode, payload delivery should be deferred if the exception * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 @@ -5123,6 +5148,19 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; process_nmi(vcpu); + + /* + * Flag that userspace is stuffing an exception, the next KVM_RUN will + * morph the exception to a VM-Exit if appropriate. Do this only for + * pending exceptions, already-injected exceptions are not subject to + * intercpetion. Note, userspace that conflates pending and injected + * is hosed, and will incorrectly convert an injected exception into a + * pending exception, which in turn may cause a spurious VM-Exit. + */ + vcpu->arch.exception_from_userspace = events->exception.pending; + + vcpu->arch.exception_vmexit.pending = false; + vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; vcpu->arch.exception.vector = events->exception.nr; @@ -8167,18 +8205,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) } } -static bool inject_emulated_exception(struct kvm_vcpu *vcpu) +static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); - if (ctxt->exception.error_code_valid) + if (ctxt->exception.vector == PF_VECTOR) + kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); - return false; } static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -8813,8 +8850,7 @@ restart: if (ctxt->have_exception) { r = 1; - if (inject_emulated_exception(vcpu)) - return r; + inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -9761,7 +9797,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) */ if (vcpu->arch.exception.injected) kvm_inject_exception(vcpu); - else if (vcpu->arch.exception.pending) + else if (kvm_is_exception_pending(vcpu)) ; /* see above */ else if (vcpu->arch.nmi_injected) static_call(kvm_x86_inject_nmi)(vcpu); @@ -9788,6 +9824,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) if (r < 0) goto out; + /* + * A pending exception VM-Exit should either result in nested VM-Exit + * or force an immediate re-entry and exit to/from L2, and exception + * VM-Exits cannot be injected (flag should _never_ be set). + */ + WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || + vcpu->arch.exception_vmexit.pending); + /* * New events, other than exceptions, cannot be injected if KVM needs * to re-inject a previous event. See above comments on re-injecting @@ -9887,7 +9931,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) *req_immediate_exit = true; - WARN_ON(vcpu->arch.exception.pending); + WARN_ON(kvm_is_exception_pending(vcpu)); return 0; out: @@ -10905,6 +10949,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; int r; @@ -10963,6 +11008,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } } + /* + * If userspace set a pending exception and L2 is active, convert it to + * a pending VM-Exit if L1 wants to intercept the exception. + */ + if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, + ex->error_code)) { + kvm_queue_exception_vmexit(vcpu, ex->vector, + ex->has_error_code, ex->error_code, + ex->has_payload, ex->payload); + ex->injected = false; + ex->pending = false; + } + vcpu->arch.exception_from_userspace = false; + if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; @@ -11069,6 +11129,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; + vcpu->arch.exception_vmexit.pending = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -11436,7 +11497,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); @@ -12670,7 +12731,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || @@ -12925,7 +12986,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) + kvm_is_exception_pending(vcpu))) return false; if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4147d27f9fbc..256745d1a2c3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -82,10 +82,17 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending; +} + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = false; + vcpu->arch.exception_vmexit.pending = false; } static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, -- cgit v1.2.3 From 5b4ac1a1b713736f906fb0378a5bde612fdad538 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 21 Sep 2022 00:31:50 +0000 Subject: KVM: x86: make vendor code check for all nested events Interrupts, NMIs etc. sent while in guest mode are already handled properly by the *_interrupt_allowed callbacks, but other events can cause a vCPU to be runnable that are specific to guest mode. In the case of VMX there are two, the preemption timer and the monitor trap. The VMX preemption timer is already special cased via the hv_timer_pending callback, but the purpose of the callback can be easily extended to MTF or in fact any other event that can occur only in guest mode. Rename the callback and add an MTF check; kvm_arch_vcpu_runnable() now can return true if an MTF is pending, without relying on kvm_vcpu_running()'s call to kvm_check_nested_events(). Until that call is removed, however, the patch introduces no functional change. Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20220921003201.1441511-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/nested.c | 8 +++++++- arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm/kvm_host.h') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b3ce723efb43..d40206b16d6c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1643,7 +1643,7 @@ struct kvm_x86_nested_ops { bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); - bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + bool (*has_events)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4da0558943ce..85318d803f4f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3929,6 +3929,12 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +{ + return nested_vmx_preemption_timer_pending(vcpu) || + to_vmx(vcpu)->nested.mtf_pending; +} + /* * Per the Intel SDM's table "Priority Among Concurrent Events", with minor * edits to fill in missing examples, e.g. #DB due to split-lock accesses, @@ -6971,7 +6977,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, - .hv_timer_pending = nested_vmx_preemption_timer_pending, + .has_events = vmx_has_nested_events, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a532b9dea57b..10f289543785 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9968,8 +9968,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, } if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) *req_immediate_exit = true; WARN_ON(kvm_is_exception_pending(vcpu)); @@ -12794,8 +12794,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) return true; if (kvm_xen_has_pending_events(vcpu)) -- cgit v1.2.3