From 46a010dd6896045484c7749b3d30b685c649eb18 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joro@8bytes.org>
Date: Tue, 24 Mar 2020 10:41:51 +0100
Subject: KVM: SVM: Move SVM related files to own sub-directory

Move svm.c and pmu_amd.c into their own arch/x86/kvm/svm/
subdirectory.

Signed-off-by: Joerg Roedel <joro@8bytes.org>
Message-Id: <20200324094154.32352-2-joro@8bytes.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/Makefile')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index e553f0fdd87d..0ef050982c37 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
 hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o

 kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
-kvm-amd-y += svm.o pmu_amd.o
+kvm-amd-y += svm/svm.o svm/pmu.o

 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
--
cgit v1.2.3


From 883b0a91f41ab705daa04c24e59d708e457a0bed Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joro@8bytes.org>
Date: Tue, 24 Mar 2020 10:41:52 +0100
Subject: KVM: SVM: Move Nested SVM Implementation to nested.c

Split out the code for the nested SVM implementation and move
it to a separate file.

Signed-off-by: Joerg Roedel <joro@8bytes.org>
Message-Id: <20200324094154.32352-3-joro@8bytes.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Makefile     |    2 +-
 arch/x86/kvm/svm/nested.c |  823 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c    | 1155 +--------------------------------------------
 arch/x86/kvm/svm/svm.h    |  381 +++++++++++++++
 4 files changed, 1216 insertions(+), 1145 deletions(-)
 create mode 100644 arch/x86/kvm/svm/nested.c
 create mode 100644 arch/x86/kvm/svm/svm.h

(limited to 'arch/x86/kvm/Makefile')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0ef050982c37..0acec73ef236 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
 hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o

 kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
-kvm-amd-y += svm/svm.o svm/pmu.o
+kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o

 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000000..90a1ca939627
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,823 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+				       struct x86_exception *fault)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+		/*
+		 * TODO: track the cause of the nested page fault, and
+		 * correctly fill in the high bits of exit_info_1.
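+		 * (For #NPF, the low 32 bits of exit_info_1 hold a #PF-style
+		 * error code, while the high bits encode where the fault was
+		 * taken: bit 32 for the final translation, bit 33 for the
+		 * guest's own page tables - hence the bit tests below.)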
+ */ + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = (1ULL << 32); + svm->vmcb->control.exit_info_2 = fault->address; + } + + svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; + svm->vmcb->control.exit_info_1 |= fault->error_code; + + /* + * The present bit is always zero for page structure faults on real + * hardware. + */ + if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) + svm->vmcb->control.exit_info_1 &= ~1; + + nested_svm_vmexit(svm); +} + +static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr3 = svm->nested.nested_cr3; + u64 pdpte; + int ret; + + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, + offset_in_page(cr3) + index * 8, 8); + if (ret) + return 0; + return pdpte; +} + +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + WARN_ON(mmu_is_nested(vcpu)); + + vcpu->arch.mmu = &vcpu->arch.guest_mmu; + kvm_init_shadow_mmu(vcpu); + vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; + vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; + vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; +} + +void recalc_intercepts(struct vcpu_svm *svm) +{ + struct vmcb_control_area *c, *h; + struct nested_state *g; + + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + + if (!is_guest_mode(&svm->vcpu)) + return; + + c = &svm->vmcb->control; + h = &svm->nested.hsave->control; + g = &svm->nested; + + c->intercept_cr = h->intercept_cr; + c->intercept_dr = h->intercept_dr; + c->intercept_exceptions = h->intercept_exceptions; + c->intercept = h->intercept; + + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of L1 */ + c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); + c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + + /* + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not + * affect any interrupt we may want to inject; therefore, + * interrupt window vmexits are irrelevant to L0. 
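+	 *
+	 * (Apart from these CR8/VINTR adjustments and the VMMCALL
+	 * filtering below, the merged result is simply the union of L1's
+	 * host intercepts (h) and the nested VMCB's intercepts (g).)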
+ */ + c->intercept &= ~(1ULL << INTERCEPT_VINTR); + } + + /* We don't want to see VMMCALLs from a nested guest */ + c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + + c->intercept_cr |= g->intercept_cr; + c->intercept_dr |= g->intercept_dr; + c->intercept_exceptions |= g->intercept_exceptions; + c->intercept |= g->intercept; +} + +static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = &dst_vmcb->control; + struct vmcb_control_area *from = &from_vmcb->control; + + dst->intercept_cr = from->intercept_cr; + dst->intercept_dr = from->intercept_dr; + dst->intercept_exceptions = from->intercept_exceptions; + dst->intercept = from->intercept; + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; +} + +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) +{ + /* + * This function merges the msr permission bitmaps of kvm and the + * nested vmcb. It is optimized in that it only merges the parts where + * the kvm msr permission bitmap may contain zero bits + */ + int i; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return true; + + for (i = 0; i < MSRPM_OFFSETS; i++) { + u32 value, p; + u64 offset; + + if (msrpm_offsets[i] == 0xffffffff) + break; + + p = msrpm_offsets[i]; + offset = svm->nested.vmcb_msrpm + (p * 4); + + if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) + return false; + + svm->nested.msrpm[p] = svm->msrpm[p] | value; + } + + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); + + return true; +} + +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->save.efer & EFER_SVME) == 0) + return false; + + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + if (vmcb->control.asid == 0) + return false; + + if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && + !npt_enabled) + return false; + + return true; +} + +void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb, struct kvm_host_map *map) +{ + bool evaluate_pending_interrupts = + is_intercept(svm, INTERCEPT_VINTR) || + is_intercept(svm, INTERCEPT_IRET); + + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + + /* Load the nested guest state */ + svm->vmcb->save.es = nested_vmcb->save.es; + svm->vmcb->save.cs = nested_vmcb->save.cs; + svm->vmcb->save.ss = nested_vmcb->save.ss; + svm->vmcb->save.ds = nested_vmcb->save.ds; + svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; + svm->vmcb->save.idtr = 
nested_vmcb->save.idtr; + kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); + svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); + svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); + svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = nested_vmcb->save.cr3; + svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; + } else + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); + + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; + kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); + kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); + kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); + + /* In case we don't even reach vcpu_run, the fields are not updated */ + svm->vmcb->save.rax = nested_vmcb->save.rax; + svm->vmcb->save.rsp = nested_vmcb->save.rsp; + svm->vmcb->save.rip = nested_vmcb->save.rip; + svm->vmcb->save.dr7 = nested_vmcb->save.dr7; + svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vmcb->save.cpl = nested_vmcb->save.cpl; + + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; + svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; + + /* cache intercepts */ + svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; + svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept = nested_vmcb->control.intercept; + + svm_flush_tlb(&svm->vcpu, true); + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; + if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) + svm->vcpu.arch.hflags |= HF_VINTR_MASK; + else + svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + + svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; + svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; + svm->vmcb->control.int_state = nested_vmcb->control.int_state; + svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; + svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + + svm->vmcb->control.pause_filter_count = + nested_vmcb->control.pause_filter_count; + svm->vmcb->control.pause_filter_thresh = + nested_vmcb->control.pause_filter_thresh; + + kvm_vcpu_unmap(&svm->vcpu, map, true); + + /* Enter Guest-Mode */ + enter_guest_mode(&svm->vcpu); + + /* + * Merge guest and host intercepts - must be called with vcpu in + * guest-mode to take affect here + */ + recalc_intercepts(svm); + + svm->nested.vmcb = vmcb_gpa; + + /* + * If L1 had a pending IRQ/NMI before executing VMRUN, + * which wasn't delivered because it was disallowed (e.g. + * interrupts disabled), L0 needs to evaluate if this pending + * event should cause an exit from L2 to L1 or be delivered + * directly to L2. + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request. However, VMRUN can unblock interrupts + * by implicitly setting GIF, so force L0 to perform pending event + * evaluation by requesting a KVM_REQ_EVENT. 
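+	 *
+	 * (enable_gif() below sets V_GIF in int_ctl when vGIF is available
+	 * and HF_GIF_MASK in hflags otherwise; KVM_REQ_EVENT then forces
+	 * the pending-event check to run before the next entry into L2.)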
+ */ + enable_gif(svm); + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + + mark_all_dirty(svm->vmcb); +} + +int nested_svm_vmrun(struct vcpu_svm *svm) +{ + int ret; + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct kvm_host_map map; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; + + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + if (ret == -EINVAL) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } else if (ret) { + return kvm_skip_emulated_instruction(&svm->vcpu); + } + + ret = kvm_skip_emulated_instruction(&svm->vcpu); + + nested_vmcb = map.hva; + + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + kvm_vcpu_unmap(&svm->vcpu, &map, true); + + return ret; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, + nested_vmcb->control.intercept_cr >> 16, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + + /* Clear internal status */ + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); + hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + + copy_vmcb_control_area(hsave, vmcb); + + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); + + if (!nested_svm_vmrun_msrpm(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); + } + + return ret; +} + +void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + to_vmcb->save.fs = from_vmcb->save.fs; + to_vmcb->save.gs = from_vmcb->save.gs; + to_vmcb->save.tr = from_vmcb->save.tr; + to_vmcb->save.ldtr = from_vmcb->save.ldtr; + to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; + to_vmcb->save.star = from_vmcb->save.star; + to_vmcb->save.lstar = from_vmcb->save.lstar; + to_vmcb->save.cstar = from_vmcb->save.cstar; + to_vmcb->save.sfmask = from_vmcb->save.sfmask; + to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; + to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; + to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; +} + +int nested_svm_vmexit(struct vcpu_svm *svm) +{ + int rc; + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct kvm_host_map map; + + trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, + vmcb->control.exit_info_1, + vmcb->control.exit_info_2, + 
vmcb->control.exit_int_info, + vmcb->control.exit_int_info_err, + KVM_ISA_SVM); + + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); + if (rc) { + if (rc == -EINVAL) + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + nested_vmcb = map.hva; + + /* Exit Guest-Mode */ + leave_guest_mode(&svm->vcpu); + svm->nested.vmcb = 0; + + /* Give the current vmcb to the guest */ + disable_gif(svm); + + nested_vmcb->save.es = vmcb->save.es; + nested_vmcb->save.cs = vmcb->save.cs; + nested_vmcb->save.ss = vmcb->save.ss; + nested_vmcb->save.ds = vmcb->save.ds; + nested_vmcb->save.gdtr = vmcb->save.gdtr; + nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; + nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); + nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); + nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; + nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); + nested_vmcb->save.rip = vmcb->save.rip; + nested_vmcb->save.rsp = vmcb->save.rsp; + nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.dr7 = vmcb->save.dr7; + nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.cpl = vmcb->save.cpl; + + nested_vmcb->control.int_ctl = vmcb->control.int_ctl; + nested_vmcb->control.int_vector = vmcb->control.int_vector; + nested_vmcb->control.int_state = vmcb->control.int_state; + nested_vmcb->control.exit_code = vmcb->control.exit_code; + nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; + nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; + nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; + nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; + + /* + * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have + * to make sure that we do not lose injected events. So check event_inj + * here and copy it to exit_int_info if it is valid. + * Exit_int_info and event_inj can't be both valid because the case + * below only happens on a VMRUN instruction intercept which has + * no valid exit_int_info set. 
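+	 *
+	 * (EVENTINJ and EXITINTINFO share the same field encoding, which
+	 * is why event_inj and its error code can be copied over verbatim.)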
+ */ + if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { + struct vmcb_control_area *nc = &nested_vmcb->control; + + nc->exit_int_info = vmcb->control.event_inj; + nc->exit_int_info_err = vmcb->control.event_inj_err; + } + + nested_vmcb->control.tlb_ctl = 0; + nested_vmcb->control.event_inj = 0; + nested_vmcb->control.event_inj_err = 0; + + nested_vmcb->control.pause_filter_count = + svm->vmcb->control.pause_filter_count; + nested_vmcb->control.pause_filter_thresh = + svm->vmcb->control.pause_filter_thresh; + + /* We always set V_INTR_MASKING and remember the old value in hflags */ + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* Restore the original control entries */ + copy_vmcb_control_area(vmcb, hsave); + + svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + svm->nested.nested_cr3 = 0; + + /* Restore selected save entries */ + svm->vmcb->save.es = hsave->save.es; + svm->vmcb->save.cs = hsave->save.cs; + svm->vmcb->save.ss = hsave->save.ss; + svm->vmcb->save.ds = hsave->save.ds; + svm->vmcb->save.gdtr = hsave->save.gdtr; + svm->vmcb->save.idtr = hsave->save.idtr; + kvm_set_rflags(&svm->vcpu, hsave->save.rflags); + svm_set_efer(&svm->vcpu, hsave->save.efer); + svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); + svm_set_cr4(&svm->vcpu, hsave->save.cr4); + if (npt_enabled) { + svm->vmcb->save.cr3 = hsave->save.cr3; + svm->vcpu.arch.cr3 = hsave->save.cr3; + } else { + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + } + kvm_rax_write(&svm->vcpu, hsave->save.rax); + kvm_rsp_write(&svm->vcpu, hsave->save.rsp); + kvm_rip_write(&svm->vcpu, hsave->save.rip); + svm->vmcb->save.dr7 = 0; + svm->vmcb->save.cpl = 0; + svm->vmcb->control.exit_int_info = 0; + + mark_all_dirty(svm->vmcb); + + kvm_vcpu_unmap(&svm->vcpu, &map, true); + + nested_svm_uninit_mmu_context(&svm->vcpu); + kvm_mmu_reset_context(&svm->vcpu); + kvm_mmu_load(&svm->vcpu); + + /* + * Drop what we picked up for L2 via svm_complete_interrupts() so it + * doesn't end up in L1. + */ + svm->vcpu.arch.nmi_injected = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + return 0; +} + +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) +{ + u32 offset, msr, value; + int write, mask; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return NESTED_EXIT_HOST; + + msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + offset = svm_msrpm_offset(msr); + write = svm->vmcb->control.exit_info_1 & 1; + mask = 1 << ((2 * (msr & 0xf)) + write); + + if (offset == MSR_INVALID) + return NESTED_EXIT_DONE; + + /* Offset is in 32 bit units but need in 8 bit units */ + offset *= 4; + + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) + return NESTED_EXIT_DONE; + + return (value & mask) ? 
NESTED_EXIT_DONE : NESTED_EXIT_HOST; +} + +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + +static int nested_svm_intercept_ioio(struct vcpu_svm *svm) +{ + unsigned port, size, iopm_len; + u16 val, mask; + u8 start_bit; + u64 gpa; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + return NESTED_EXIT_HOST; + + port = svm->vmcb->control.exit_info_1 >> 16; + size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> + SVM_IOIO_SIZE_SHIFT; + gpa = svm->nested.vmcb_iopm + (port / 8); + start_bit = port % 8; + iopm_len = (start_bit + size > 8) ? 2 : 1; + mask = (0xf >> (4 - size)) << start_bit; + val = 0; + + if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) + return NESTED_EXIT_DONE; + + return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; +} + +static int nested_svm_intercept(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + + switch (exit_code) { + case SVM_EXIT_MSR: + vmexit = nested_svm_exit_handled_msr(svm); + break; + case SVM_EXIT_IOIO: + vmexit = nested_svm_intercept_ioio(svm); + break; + case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { + u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); + if (svm->nested.intercept_cr & bit) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { + u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); + if (svm->nested.intercept_dr & bit) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } + /* async page fault always cause vmexit */ + else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && + svm->vcpu.arch.exception.nested_apf != 0) + vmexit = NESTED_EXIT_DONE; + break; + } + case SVM_EXIT_ERR: { + vmexit = NESTED_EXIT_DONE; + break; + } + default: { + u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); + if (svm->nested.intercept & exit_bits) + vmexit = NESTED_EXIT_DONE; + } + } + + return vmexit; +} + +int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + int vmexit; + + vmexit = nested_svm_intercept(svm); + + if (vmexit == NESTED_EXIT_DONE) + nested_svm_vmexit(svm); + + return vmexit; +} + +int nested_svm_check_permissions(struct vcpu_svm *svm) +{ + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + + if (svm->vmcb->save.cpl) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } + + return 0; +} + +int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code) +{ + int vmexit; + + if (!is_guest_mode(&svm->vcpu)) + return 0; + + vmexit = nested_svm_intercept(svm); + if (vmexit != NESTED_EXIT_DONE) + return 0; + + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + + /* + * EXITINFO2 is undefined for all exception intercepts other + * than #PF. + */ + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + + svm->nested.exit_required = true; + return vmexit; +} + +static void nested_svm_intr(struct vcpu_svm *svm) +{ + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + /* nested_svm_vmexit this gets called afterwards from handle_exit */ + svm->nested.exit_required = true; + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); +} + +static bool nested_exit_on_intr(struct vcpu_svm *svm) +{ + return (svm->nested.intercept & 1ULL); +} + +int svm_check_nested_events(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool block_nested_events = + kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; + + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { + if (block_nested_events) + return -EBUSY; + nested_svm_intr(svm); + return 0; + } + + return 0; +} + +int nested_svm_exit_special(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + case SVM_EXIT_EXCP_BASE + MC_VECTOR: + return NESTED_EXIT_HOST; + case SVM_EXIT_NPF: + /* For now we are always handling NPFs when using them */ + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + /* When we're shadowing, trap PFs, but not async PF */ + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) + return NESTED_EXIT_HOST; + break; + default: + break; + } + + return NESTED_EXIT_CONTINUE; +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 
851e9cc79930..a24e2de4acda 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -53,6 +53,8 @@ #include #include "trace.h" +#include "svm.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -82,10 +84,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SVM_AVIC_DOORBELL 0xc001011b -#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ -#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ -#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ - #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) #define TSC_RATIO_RSVD 0xffffff0000000000ULL @@ -119,68 +117,7 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); static bool erratum_383_found __read_mostly; -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_TSC_AUX, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_sev_info { - bool active; /* SEV enabled guest */ - unsigned int asid; /* ASID used for this guest */ - unsigned int handle; /* SEV firmware handle */ - int fd; /* SEV device fd */ - unsigned long pages_locked; /* Number of pages locked */ - struct list_head regions_list; /* List of registered regions */ -}; - -struct kvm_svm { - struct kvm kvm; - - /* Struct members for AVIC */ - u32 avic_vm_id; - struct page *avic_logical_id_table_page; - struct page *avic_physical_id_table_page; - struct hlist_node hnode; - - struct kvm_sev_info sev_info; -}; - -struct kvm_vcpu; - -struct nested_state { - struct vmcb *hsave; - u64 hsave_msr; - u64 vm_cr_msr; - u64 vmcb; - - /* These are the merged vectors */ - u32 *msrpm; - - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; - - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; -}; - -#define MSRPM_OFFSETS 16 -static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; /* * Set osvw_len to higher value when updated Revision Guides @@ -188,70 +125,6 @@ static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; */ static uint64_t osvw_len = 4, osvw_status; -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; - uint64_t tsc_aux; - - u64 msr_decfg; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; - - u64 spec_ctrl; - /* - * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be - * translated into the appropriate L2_CFG bits on the host to - * perform speculative control. - */ - u64 virt_spec_ctrl; - - u32 *msrpm; - - ulong nmi_iret_rip; - - struct nested_state nested; - - bool nmi_singlestep; - u64 nmi_singlestep_guest_rflags; - - unsigned int3_injected; - unsigned long int3_rip; - - /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; - - u32 ldr_reg; - u32 dfr_reg; - struct page *avic_backing_page; - u64 *avic_physical_id_cache; - bool avic_is_running; - - /* - * Per-vcpu list of struct amd_svm_iommu_ir: - * This is used mainly to store interrupt remapping information used - * when update the vcpu affinity. 
This avoids the need to scan for - * IRTE and try to match ga_tag in the IOMMU driver. - */ - struct list_head ir_list; - spinlock_t ir_list_lock; - - /* which host CPU was used for running this vcpu */ - unsigned int last_cpu; -}; - /* * This is a wrapper of struct amd_iommu_ir_data. */ @@ -272,8 +145,6 @@ struct amd_svm_iommu_ir { static DEFINE_PER_CPU(u64, current_tsc_ratio); #define TSC_RATIO_DEFAULT 0x0100000000ULL -#define MSR_INVALID 0xffffffffU - static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ @@ -299,9 +170,9 @@ static const struct svm_direct_access_msrs { /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static bool npt_enabled = true; +bool npt_enabled = true; #else -static bool npt_enabled; +bool npt_enabled; #endif /* @@ -387,41 +258,10 @@ module_param(dump_invalid_vmcb, bool, 0644); static u8 rsm_ins_bytes[] = "\x0f\xaa"; -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); static inline void avic_post_state_restore(struct kvm_vcpu *vcpu); -static int nested_svm_exit_handled(struct vcpu_svm *svm); -static int nested_svm_intercept(struct vcpu_svm *svm); -static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code); - -enum { - VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, - pause filter count */ - VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ - VMCB_ASID, /* ASID */ - VMCB_INTR, /* int_ctl, int_vector */ - VMCB_NPT, /* npt_en, nCR3, gPAT */ - VMCB_CR, /* CR0, CR3, CR4, EFER */ - VMCB_DR, /* DR6, DR7 */ - VMCB_DT, /* GDT, IDT */ - VMCB_SEG, /* CS, DS, SS, ES, CPL */ - VMCB_CR2, /* CR2 only */ - VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ - VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, - * AVIC PHYSICAL_TABLE pointer, - * AVIC LOGICAL_TABLE pointer - */ - VMCB_DIRTY_MAX, -}; - -/* TPR and CR2 are always written before VMRUN */ -#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) - #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL static int sev_flush_asids(void); @@ -470,27 +310,6 @@ static inline int sev_get_asid(struct kvm *kvm) return sev->asid; } -static inline void mark_all_dirty(struct vmcb *vmcb) -{ - vmcb->control.clean = 0; -} - -static inline void mark_all_clean(struct vmcb *vmcb) -{ - vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) - & ~VMCB_ALWAYS_DIRTY_MASK; -} - -static inline void mark_dirty(struct vmcb *vmcb, int bit) -{ - vmcb->control.clean &= ~(1 << bit); -} - -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_svm, vcpu); -} - static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; @@ -508,183 +327,6 @@ static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); } -static void recalc_intercepts(struct vcpu_svm *svm) -{ - struct vmcb_control_area *c, *h; - struct nested_state *g; - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - if (!is_guest_mode(&svm->vcpu)) - return; - - c = &svm->vmcb->control; - h = &svm->nested.hsave->control; - g = 
&svm->nested; - - c->intercept_cr = h->intercept_cr; - c->intercept_dr = h->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions; - c->intercept = h->intercept; - - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { - /* We only want the cr8 intercept bits of L1 */ - c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); - c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); - - /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not - * affect any interrupt we may want to inject; therefore, - * interrupt window vmexits are irrelevant to L0. - */ - c->intercept &= ~(1ULL << INTERCEPT_VINTR); - } - - /* We don't want to see VMMCALLs from a nested guest */ - c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); - - c->intercept_cr |= g->intercept_cr; - c->intercept_dr |= g->intercept_dr; - c->intercept_exceptions |= g->intercept_exceptions; - c->intercept |= g->intercept; -} - -static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) -{ - if (is_guest_mode(&svm->vcpu)) - return svm->nested.hsave; - else - return svm->vmcb; -} - -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); -} - -static inline void set_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) - | (1 << INTERCEPT_DR1_READ) - | (1 << INTERCEPT_DR2_READ) - | (1 << INTERCEPT_DR3_READ) - | (1 << INTERCEPT_DR4_READ) - | (1 << INTERCEPT_DR5_READ) - | (1 << INTERCEPT_DR6_READ) - | (1 << INTERCEPT_DR7_READ) - | (1 << INTERCEPT_DR0_WRITE) - | (1 << INTERCEPT_DR1_WRITE) - | (1 << INTERCEPT_DR2_WRITE) - | (1 << INTERCEPT_DR3_WRITE) - | (1 << INTERCEPT_DR4_WRITE) - | (1 << INTERCEPT_DR5_WRITE) - | (1 << INTERCEPT_DR6_WRITE) - | (1 << INTERCEPT_DR7_WRITE); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercepts(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr = 0; - - recalc_intercepts(svm); -} - -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline void set_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept |= (1ULL << bit); - - recalc_intercepts(svm); -} - -static inline void clr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept &= ~(1ULL << bit); - - recalc_intercepts(svm); -} - -static inline bool is_intercept(struct vcpu_svm *svm, int bit) -{ - return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; -} - -static inline bool vgif_enabled(struct vcpu_svm *svm) -{ - return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); -} - -static inline void enable_gif(struct vcpu_svm *svm) -{ - if 
(vgif_enabled(svm)) - svm->vmcb->control.int_ctl |= V_GIF_MASK; - else - svm->vcpu.arch.hflags |= HF_GIF_MASK; -} - -static inline void disable_gif(struct vcpu_svm *svm) -{ - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl &= ~V_GIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; -} - -static inline bool gif_set(struct vcpu_svm *svm) -{ - if (vgif_enabled(svm)) - return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); - else - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); -} - static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -720,7 +362,7 @@ static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) -static u32 svm_msrpm_offset(u32 msr) +u32 svm_msrpm_offset(u32 msr) { u32 offset; int i; @@ -767,7 +409,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu) #endif } -static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; @@ -1198,7 +840,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } -static void disable_nmi_singlestep(struct vcpu_svm *svm) +void disable_nmi_singlestep(struct vcpu_svm *svm) { svm->nmi_singlestep = false; @@ -2652,7 +2294,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) } } -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2686,7 +2328,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) update_cr0_intercept(svm); } -static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; @@ -3022,776 +2664,6 @@ static int vmmcall_interception(struct vcpu_svm *svm) return kvm_emulate_hypercall(&svm->vcpu); } -static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->nested.nested_cr3; -} - -static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 cr3 = svm->nested.nested_cr3; - u64 pdpte; - int ret; - - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, - offset_in_page(cr3) + index * 8, 8); - if (ret) - return 0; - return pdpte; -} - -static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { - /* - * TODO: track the cause of the nested page fault, and - * correctly fill in the high bits of exit_info_1. - */ - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = (1ULL << 32); - svm->vmcb->control.exit_info_2 = fault->address; - } - - svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; - svm->vmcb->control.exit_info_1 |= fault->error_code; - - /* - * The present bit is always zero for page structure faults on real - * hardware. 
- */ - if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) - svm->vmcb->control.exit_info_1 &= ~1; - - nested_svm_vmexit(svm); -} - -static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) -{ - WARN_ON(mmu_is_nested(vcpu)); - - vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu); - vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; - vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; - vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); - reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; -} - -static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) -{ - vcpu->arch.mmu = &vcpu->arch.root_mmu; - vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; -} - -static int nested_svm_check_permissions(struct vcpu_svm *svm) -{ - if (!(svm->vcpu.arch.efer & EFER_SVME) || - !is_paging(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - - if (svm->vmcb->save.cpl) { - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } - - return 0; -} - -static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code) -{ - int vmexit; - - if (!is_guest_mode(&svm->vcpu)) - return 0; - - vmexit = nested_svm_intercept(svm); - if (vmexit != NESTED_EXIT_DONE) - return 0; - - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - - /* - * EXITINFO2 is undefined for all exception intercepts other - * than #PF. - */ - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; - else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - - svm->nested.exit_required = true; - return vmexit; -} - -static void nested_svm_intr(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - /* nested_svm_vmexit this gets called afterwards from handle_exit */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); -} - -static bool nested_exit_on_intr(struct vcpu_svm *svm) -{ - return (svm->nested.intercept & 1ULL); -} - -static int svm_check_nested_events(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - bool block_nested_events = - kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; - - if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { - if (block_nested_events) - return -EBUSY; - nested_svm_intr(svm); - return 0; - } - - return 0; -} - -/* This function returns true if it is save to enable the nmi window */ -static inline bool nested_svm_nmi(struct vcpu_svm *svm) -{ - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) - return true; - - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->nested.exit_required = true; - - return false; -} - -static int nested_svm_intercept_ioio(struct vcpu_svm *svm) -{ - unsigned port, size, iopm_len; - u16 val, mask; - u8 start_bit; - u64 gpa; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) - return NESTED_EXIT_HOST; - - port = svm->vmcb->control.exit_info_1 >> 16; - size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> - SVM_IOIO_SIZE_SHIFT; - gpa = svm->nested.vmcb_iopm + (port / 8); 
- start_bit = port % 8; - iopm_len = (start_bit + size > 8) ? 2 : 1; - mask = (0xf >> (4 - size)) << start_bit; - val = 0; - - if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) - return NESTED_EXIT_DONE; - - return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; -} - -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) -{ - u32 offset, msr, value; - int write, mask; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return NESTED_EXIT_HOST; - - msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - offset = svm_msrpm_offset(msr); - write = svm->vmcb->control.exit_info_1 & 1; - mask = 1 << ((2 * (msr & 0xf)) + write); - - if (offset == MSR_INVALID) - return NESTED_EXIT_DONE; - - /* Offset is in 32 bit units but need in 8 bit units */ - offset *= 4; - - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) - return NESTED_EXIT_DONE; - - return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; -} - -/* DB exceptions for our internal use must not cause vmexit */ -static int nested_svm_intercept_db(struct vcpu_svm *svm) -{ - unsigned long dr6; - - /* if we're not singlestepping, it's not ours */ - if (!svm->nmi_singlestep) - return NESTED_EXIT_DONE; - - /* if it's not a singlestep exception, it's not ours */ - if (kvm_get_dr(&svm->vcpu, 6, &dr6)) - return NESTED_EXIT_DONE; - if (!(dr6 & DR6_BS)) - return NESTED_EXIT_DONE; - - /* if the guest is singlestepping, it should get the vmexit */ - if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { - disable_nmi_singlestep(svm); - return NESTED_EXIT_DONE; - } - - /* it's ours, the nested hypervisor must not see this one */ - return NESTED_EXIT_HOST; -} - -static int nested_svm_exit_special(struct vcpu_svm *svm) -{ - u32 exit_code = svm->vmcb->control.exit_code; - - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: - return NESTED_EXIT_HOST; - case SVM_EXIT_NPF: - /* For now we are always handling NPFs when using them */ - if (npt_enabled) - return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) - return NESTED_EXIT_HOST; - break; - default: - break; - } - - return NESTED_EXIT_CONTINUE; -} - -static int nested_svm_intercept(struct vcpu_svm *svm) -{ - u32 exit_code = svm->vmcb->control.exit_code; - int vmexit = NESTED_EXIT_HOST; - - switch (exit_code) { - case SVM_EXIT_MSR: - vmexit = nested_svm_exit_handled_msr(svm); - break; - case SVM_EXIT_IOIO: - vmexit = nested_svm_intercept_ioio(svm); - break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.intercept_cr & bit) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.intercept_dr & bit) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { - u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) { - if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) - vmexit = nested_svm_intercept_db(svm); - else - vmexit = NESTED_EXIT_DONE; - } - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->vcpu.arch.exception.nested_apf != 0) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_ERR: { - vmexit = NESTED_EXIT_DONE; - break; - } - default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.intercept & exit_bits) - vmexit = NESTED_EXIT_DONE; - } - } - - return vmexit; -} - -static int nested_svm_exit_handled(struct vcpu_svm *svm) -{ - int vmexit; - - vmexit = nested_svm_intercept(svm); - - if (vmexit == NESTED_EXIT_DONE) - nested_svm_vmexit(svm); - - return vmexit; -} - -static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) -{ - struct vmcb_control_area *dst = &dst_vmcb->control; - struct vmcb_control_area *from = &from_vmcb->control; - - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; - dst->intercept_exceptions = from->intercept_exceptions; - dst->intercept = from->intercept; - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - int rc; - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct kvm_host_map map; - - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); - if (rc) { - if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } - - nested_vmcb = map.hva; - - /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; - - /* Give the current vmcb to the guest */ - disable_gif(svm); - - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = vmcb->save.rip; - nested_vmcb->save.rsp = vmcb->save.rsp; - nested_vmcb->save.rax = vmcb->save.rax; - nested_vmcb->save.dr7 = 
vmcb->save.dr7; - nested_vmcb->save.dr6 = vmcb->save.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; - - nested_vmcb->control.int_ctl = vmcb->control.int_ctl; - nested_vmcb->control.int_vector = vmcb->control.int_vector; - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; - nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - - if (svm->nrips_enabled) - nested_vmcb->control.next_rip = vmcb->control.next_rip; - - /* - * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have - * to make sure that we do not lose injected events. So check event_inj - * here and copy it to exit_int_info if it is valid. - * Exit_int_info and event_inj can't be both valid because the case - * below only happens on a VMRUN instruction intercept which has - * no valid exit_int_info set. - */ - if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { - struct vmcb_control_area *nc = &nested_vmcb->control; - - nc->exit_int_info = vmcb->control.event_inj; - nc->exit_int_info_err = vmcb->control.event_inj_err; - } - - nested_vmcb->control.tlb_ctl = 0; - nested_vmcb->control.event_inj = 0; - nested_vmcb->control.event_inj_err = 0; - - nested_vmcb->control.pause_filter_count = - svm->vmcb->control.pause_filter_count; - nested_vmcb->control.pause_filter_thresh = - svm->vmcb->control.pause_filter_thresh; - - /* We always set V_INTR_MASKING and remember the old value in hflags */ - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - - /* Restore the original control entries */ - copy_vmcb_control_area(vmcb, hsave); - - svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - svm->nested.nested_cr3 = 0; - - /* Restore selected save entries */ - svm->vmcb->save.es = hsave->save.es; - svm->vmcb->save.cs = hsave->save.cs; - svm->vmcb->save.ss = hsave->save.ss; - svm->vmcb->save.ds = hsave->save.ds; - svm->vmcb->save.gdtr = hsave->save.gdtr; - svm->vmcb->save.idtr = hsave->save.idtr; - kvm_set_rflags(&svm->vcpu, hsave->save.rflags); - svm_set_efer(&svm->vcpu, hsave->save.efer); - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); - svm_set_cr4(&svm->vcpu, hsave->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } - kvm_rax_write(&svm->vcpu, hsave->save.rax); - kvm_rsp_write(&svm->vcpu, hsave->save.rsp); - kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = 0; - svm->vmcb->save.cpl = 0; - svm->vmcb->control.exit_int_info = 0; - - mark_all_dirty(svm->vmcb); - - kvm_vcpu_unmap(&svm->vcpu, &map, true); - - nested_svm_uninit_mmu_context(&svm->vcpu); - kvm_mmu_reset_context(&svm->vcpu); - kvm_mmu_load(&svm->vcpu); - - /* - * Drop what we picked up for L2 via svm_complete_interrupts() so it - * doesn't end up in L1. 
- */ - svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - return 0; -} - -static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) -{ - /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits - */ - int i; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return true; - - for (i = 0; i < MSRPM_OFFSETS; i++) { - u32 value, p; - u64 offset; - - if (msrpm_offsets[i] == 0xffffffff) - break; - - p = msrpm_offsets[i]; - offset = svm->nested.vmcb_msrpm + (p * 4); - - if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) - return false; - - svm->nested.msrpm[p] = svm->msrpm[p] | value; - } - - svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); - - return true; -} - -static bool nested_vmcb_checks(struct vmcb *vmcb) -{ - if ((vmcb->save.efer & EFER_SVME) == 0) - return false; - - if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) - return false; - - if (vmcb->control.asid == 0) - return false; - - if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && - !npt_enabled) - return false; - - return true; -} - -static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map) -{ - bool evaluate_pending_interrupts = - is_intercept(svm, INTERCEPT_VINTR) || - is_intercept(svm, INTERCEPT_IRET); - - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - - if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; - nested_svm_init_mmu_context(&svm->vcpu); - } - - /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = nested_vmcb->save.cr3; - svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); - kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); - kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); - - /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vmcb->save.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; - - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; - - /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; - 
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; - svm->nested.intercept = nested_vmcb->control.intercept; - - svm_flush_tlb(&svm->vcpu, true); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) - svm->vcpu.arch.hflags |= HF_VINTR_MASK; - else - svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; - - svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; - svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; - svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; - svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - - svm->vmcb->control.pause_filter_count = - nested_vmcb->control.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = - nested_vmcb->control.pause_filter_thresh; - - kvm_vcpu_unmap(&svm->vcpu, map, true); - - /* Enter Guest-Mode */ - enter_guest_mode(&svm->vcpu); - - /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here - */ - recalc_intercepts(svm); - - svm->nested.vmcb = vmcb_gpa; - - /* - * If L1 had a pending IRQ/NMI before executing VMRUN, - * which wasn't delivered because it was disallowed (e.g. - * interrupts disabled), L0 needs to evaluate if this pending - * event should cause an exit from L2 to L1 or be delivered - * directly to L2. - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request. However, VMRUN can unblock interrupts - * by implicitly setting GIF, so force L0 to perform pending event - * evaluation by requesting a KVM_REQ_EVENT. 
- */ - enable_gif(svm); - if (unlikely(evaluate_pending_interrupts)) - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - - mark_all_dirty(svm->vmcb); -} - -static int nested_svm_vmrun(struct vcpu_svm *svm) -{ - int ret; - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct kvm_host_map map; - u64 vmcb_gpa; - - vmcb_gpa = svm->vmcb->save.rax; - - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); - if (ret == -EINVAL) { - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } else if (ret) { - return kvm_skip_emulated_instruction(&svm->vcpu); - } - - ret = kvm_skip_emulated_instruction(&svm->vcpu); - - nested_vmcb = map.hva; - - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - kvm_vcpu_unmap(&svm->vcpu, &map, true); - - return ret; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - - /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(hsave, vmcb); - - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); - - if (!nested_svm_vmrun_msrpm(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - } - - return ret; -} - -static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) -{ - to_vmcb->save.fs = from_vmcb->save.fs; - to_vmcb->save.gs = from_vmcb->save.gs; - to_vmcb->save.tr = from_vmcb->save.tr; - to_vmcb->save.ldtr = from_vmcb->save.ldtr; - to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; - to_vmcb->save.star = from_vmcb->save.star; - to_vmcb->save.lstar = from_vmcb->save.lstar; - to_vmcb->save.cstar = from_vmcb->save.cstar; - to_vmcb->save.sfmask = from_vmcb->save.sfmask; - to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; - to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; - to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; -} - static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -5186,11 +4058,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) -{ - return is_guest_mode(vcpu) && 
(vcpu->arch.hflags & HF_VINTR_MASK); -} - static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5632,7 +4499,7 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return 0; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { struct vcpu_svm *svm = to_svm(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h new file mode 100644 index 000000000000..f4c446d7a31e --- /dev/null +++ b/arch/x86/kvm/svm/svm.h @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + */ + +#ifndef __SVM_SVM_H +#define __SVM_SVM_H + +#include +#include + +#include + +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_TSC_AUX, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +#define MSRPM_OFFSETS 16 +extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +extern bool npt_enabled; + +enum { + VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, + pause filter count */ + VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ + VMCB_ASID, /* ASID */ + VMCB_INTR, /* int_ctl, int_vector */ + VMCB_NPT, /* npt_en, nCR3, gPAT */ + VMCB_CR, /* CR0, CR3, CR4, EFER */ + VMCB_DR, /* DR6, DR7 */ + VMCB_DT, /* GDT, IDT */ + VMCB_SEG, /* CS, DS, SS, ES, CPL */ + VMCB_CR2, /* CR2 only */ + VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, + * AVIC PHYSICAL_TABLE pointer, + * AVIC LOGICAL_TABLE pointer + */ + VMCB_DIRTY_MAX, +}; + +/* TPR and CR2 are always written before VMRUN */ +#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) + +struct kvm_sev_info { + bool active; /* SEV enabled guest */ + unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ + unsigned long pages_locked; /* Number of pages locked */ + struct list_head regions_list; /* List of registered regions */ +}; + +struct kvm_svm { + struct kvm kvm; + + /* Struct members for AVIC */ + u32 avic_vm_id; + struct page *avic_logical_id_table_page; + struct page *avic_physical_id_table_page; + struct hlist_node hnode; + + struct kvm_sev_info sev_info; +}; + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vm_cr_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + u64 vmcb_iopm; + + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + + /* cache for intercepts of the guest */ + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + + /* Nested Paging related state */ + u64 nested_cr3; +}; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t tsc_aux; + + u64 msr_decfg; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + struct { + u16 fs; + u16 gs; + u16 ldt; + u64 gs_base; + } 
host; + + u64 spec_ctrl; + /* + * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be + * translated into the appropriate L2_CFG bits on the host to + * perform speculative control. + */ + u64 virt_spec_ctrl; + + u32 *msrpm; + + ulong nmi_iret_rip; + + struct nested_state nested; + + bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; + + unsigned int3_injected; + unsigned long int3_rip; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; + + u32 ldr_reg; + u32 dfr_reg; + struct page *avic_backing_page; + u64 *avic_physical_id_cache; + bool avic_is_running; + + /* + * Per-vcpu list of struct amd_svm_iommu_ir: + * This is used mainly to store interrupt remapping information used + * when update the vcpu affinity. This avoids the need to scan for + * IRTE and try to match ga_tag in the IOMMU driver. + */ + struct list_head ir_list; + spinlock_t ir_list_lock; + + /* which host CPU was used for running this vcpu */ + unsigned int last_cpu; +}; + +void recalc_intercepts(struct vcpu_svm *svm); + +static inline void mark_all_dirty(struct vmcb *vmcb) +{ + vmcb->control.clean = 0; +} + +static inline void mark_all_clean(struct vmcb *vmcb) +{ + vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) + & ~VMCB_ALWAYS_DIRTY_MASK; +} + +static inline void mark_dirty(struct vmcb *vmcb, int bit) +{ + vmcb->control.clean &= ~(1 << bit); +} + +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_svm, vcpu); +} + +static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) +{ + if (is_guest_mode(&svm->vcpu)) + return svm->nested.hsave; + else + return svm->vmcb; +} + +static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_cr |= (1U << bit); + + recalc_intercepts(svm); +} + +static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_cr &= ~(1U << bit); + + recalc_intercepts(svm); +} + +static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + return vmcb->control.intercept_cr & (1U << bit); +} + +static inline void set_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) + | (1 << INTERCEPT_DR1_READ) + | (1 << INTERCEPT_DR2_READ) + | (1 << INTERCEPT_DR3_READ) + | (1 << INTERCEPT_DR4_READ) + | (1 << INTERCEPT_DR5_READ) + | (1 << INTERCEPT_DR6_READ) + | (1 << INTERCEPT_DR7_READ) + | (1 << INTERCEPT_DR0_WRITE) + | (1 << INTERCEPT_DR1_WRITE) + | (1 << INTERCEPT_DR2_WRITE) + | (1 << INTERCEPT_DR3_WRITE) + | (1 << INTERCEPT_DR4_WRITE) + | (1 << INTERCEPT_DR5_WRITE) + | (1 << INTERCEPT_DR6_WRITE) + | (1 << INTERCEPT_DR7_WRITE); + + recalc_intercepts(svm); +} + +static inline void clr_dr_intercepts(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_dr = 0; + + recalc_intercepts(svm); +} + +static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_exceptions |= (1U << bit); + + recalc_intercepts(svm); +} + +static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept_exceptions &= ~(1U << bit); + + recalc_intercepts(svm); +} + +static inline void set_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb 
*vmcb = get_host_vmcb(svm); + + vmcb->control.intercept |= (1ULL << bit); + + recalc_intercepts(svm); +} + +static inline void clr_intercept(struct vcpu_svm *svm, int bit) +{ + struct vmcb *vmcb = get_host_vmcb(svm); + + vmcb->control.intercept &= ~(1ULL << bit); + + recalc_intercepts(svm); +} + +static inline bool is_intercept(struct vcpu_svm *svm, int bit) +{ + return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; +} + +static inline bool vgif_enabled(struct vcpu_svm *svm) +{ + return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); +} + +static inline void enable_gif(struct vcpu_svm *svm) +{ + if (vgif_enabled(svm)) + svm->vmcb->control.int_ctl |= V_GIF_MASK; + else + svm->vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + if (vgif_enabled(svm)) + svm->vmcb->control.int_ctl &= ~V_GIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + if (vgif_enabled(svm)) + return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); + else + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); +} + +/* svm.c */ +#define MSR_INVALID 0xffffffffU + +u32 svm_msrpm_offset(u32 msr); +void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); +void disable_nmi_singlestep(struct vcpu_svm *svm); + +/* nested.c */ + +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + +/* This function returns true if it is safe to enable the nmi window */ +static inline bool nested_svm_nmi(struct vcpu_svm *svm) +{ + if (!is_guest_mode(&svm->vcpu)) + return true; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) + return true; + + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->nested.exit_required = true; + + return false; +} + +static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +{ + return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); +} + +void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb, struct kvm_host_map *map); +int nested_svm_vmrun(struct vcpu_svm *svm); +void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); +int nested_svm_vmexit(struct vcpu_svm *svm); +int nested_svm_exit_handled(struct vcpu_svm *svm); +int nested_svm_check_permissions(struct vcpu_svm *svm); +int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, + bool has_error_code, u32 error_code); +int svm_check_nested_events(struct kvm_vcpu *vcpu); +int nested_svm_exit_special(struct vcpu_svm *svm); + +#endif -- cgit v1.2.3 From ef0f64960d012cbab8f55f305ef36bb6de4e1a9b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 31 Mar 2020 12:17:38 -0400 Subject: KVM: SVM: Move AVIC code to separate file Move the AVIC related functions from svm.c to the new avic.c file.
Signed-off-by: Joerg Roedel Message-Id: <20200324094154.32352-4-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/svm/avic.c | 1027 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1050 +---------------------------------------------- arch/x86/kvm/svm/svm.h | 62 +++ 4 files changed, 1091 insertions(+), 1050 deletions(-) create mode 100644 arch/x86/kvm/svm/avic.c (limited to 'arch/x86/kvm/Makefile') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 0acec73ef236..31aca07d8f12 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o -kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o +kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o svm/avic.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c new file mode 100644 index 000000000000..e80daa98682f --- /dev/null +++ b/arch/x86/kvm/svm/avic.c @@ -0,0 +1,1027 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + */ + +#define pr_fmt(fmt) "SVM: " fmt + +#include +#include +#include +#include + +#include + +#include "trace.h" +#include "lapic.h" +#include "x86.h" +#include "irq.h" +#include "svm.h" + +/* enable / disable AVIC */ +int avic; +#ifdef CONFIG_X86_LOCAL_APIC +module_param(avic, int, S_IRUGO); +#endif + +#define SVM_AVIC_DOORBELL 0xc001011b + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + +/* Note: + * This hash table is used to map VM_ID to a struct kvm_svm, + * when handling AMD IOMMU GALOG notification to schedule in + * a particular vCPU. + */ +#define SVM_VM_DATA_HASH_BITS 8 +static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static u32 next_vm_id = 0; +static bool next_vm_id_wrapped = 0; +static DEFINE_SPINLOCK(svm_vm_data_hash_lock); + +/* + * This is a wrapper of struct amd_iommu_ir_data. 
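+ * One such wrapper is queued on a vCPU's ir_list for every
+ * interrupt remapping table entry that targets that vCPU, so the
+ * entries can later be updated without scanning the IOMMU tables.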
+ */ +struct amd_svm_iommu_ir { + struct list_head node; /* Used by SVM for per-vcpu ir_list */ + void *data; /* Storing pointer to struct amd_ir_data */ +}; + +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + +/* Note: + * This function is called from IOMMU driver to notify + * SVM to schedule in a particular vCPU of a particular VM. + */ +int avic_ga_log_notifier(u32 ga_tag) +{ + unsigned long flags; + struct kvm_svm *kvm_svm; + struct kvm_vcpu *vcpu = NULL; + u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + + pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + trace_kvm_avic_ga_log(vm_id, vcpu_id); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { + if (kvm_svm->avic_vm_id != vm_id) + continue; + vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); + break; + } + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + /* Note: + * At this point, the IOMMU should have already set the pending + * bit in the vAPIC backing page. So, we just need to schedule + * in the vcpu. + */ + if (vcpu) + kvm_vcpu_wake_up(vcpu); + + return 0; +} + +void avic_vm_destroy(struct kvm *kvm) +{ + unsigned long flags; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (!avic) + return; + + if (kvm_svm->avic_logical_id_table_page) + __free_page(kvm_svm->avic_logical_id_table_page); + if (kvm_svm->avic_physical_id_table_page) + __free_page(kvm_svm->avic_physical_id_table_page); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_del(&kvm_svm->hnode); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); +} + +int avic_vm_init(struct kvm *kvm) +{ + unsigned long flags; + int err = -ENOMEM; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + struct kvm_svm *k2; + struct page *p_page; + struct page *l_page; + u32 vm_id; + + if (!avic) + return 0; + + /* Allocating physical APIC ID table (4KB) */ + p_page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!p_page) + goto free_avic; + + kvm_svm->avic_physical_id_table_page = p_page; + clear_page(page_address(p_page)); + + /* Allocating logical APIC ID table (4KB) */ + l_page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!l_page) + goto free_avic; + + kvm_svm->avic_logical_id_table_page = l_page; + clear_page(page_address(l_page)); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + again: + vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; + if (vm_id == 0) { /* id is 1-based, zero is not okay */ + next_vm_id_wrapped = 1; + goto again; + } + /* Is it still in use? 
Only possible if wrapped at least once */ + if (next_vm_id_wrapped) { + hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { + if (k2->avic_vm_id == vm_id) + goto again; + } + } + kvm_svm->avic_vm_id = vm_id; + hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + return 0; + +free_avic: + avic_vm_destroy(kvm); + return err; +} + +void avic_init_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); + phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); + phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); + + vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; + vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + if (kvm_apicv_activated(svm->vcpu.kvm)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; +} + +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, + unsigned int index) +{ + u64 *avic_physical_id_table; + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); + + if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + return NULL; + + avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); + + return &avic_physical_id_table[index]; +} + +/** + * Note: + * AVIC hardware walks the nested page table to check permissions, + * but does not use the SPA address specified in the leaf page + * table entry since it uses address in the AVIC_BACKING_PAGE pointer + * field of the VMCB. Therefore, we set up the + * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. + */ +static int avic_update_access_page(struct kvm *kvm, bool activate) +{ + int ret = 0; + + mutex_lock(&kvm->slots_lock); + /* + * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger + * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT + * memory region. So, we need to ensure that kvm->mm == current->mm. + */ + if ((kvm->arch.apic_access_page_done == activate) || + (kvm->mm != current->mm)) + goto out; + + ret = __x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + activate ? 
PAGE_SIZE : 0); + if (ret) + goto out; + + kvm->arch.apic_access_page_done = activate; +out: + mutex_unlock(&kvm->slots_lock); + return ret; +} + +static int avic_init_backing_page(struct kvm_vcpu *vcpu) +{ + u64 *entry, new_entry; + int id = vcpu->vcpu_id; + struct vcpu_svm *svm = to_svm(vcpu); + + if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + return -EINVAL; + + if (!svm->vcpu.arch.apic->regs) + return -EINVAL; + + if (kvm_apicv_activated(vcpu->kvm)) { + int ret; + + ret = avic_update_access_page(vcpu->kvm, true); + if (ret) + return ret; + } + + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + + /* Setting AVIC backing page address in the phy APIC ID table */ + entry = avic_get_physical_id_entry(vcpu, id); + if (!entry) + return -EINVAL; + + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); + WRITE_ONCE(*entry, new_entry); + + svm->avic_physical_id_cache = entry; + + return 0; +} + +int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +{ + u32 icrh = svm->vmcb->control.exit_info_1 >> 32; + u32 icrl = svm->vmcb->control.exit_info_1; + u32 id = svm->vmcb->control.exit_info_2 >> 32; + u32 index = svm->vmcb->control.exit_info_2 & 0xFF; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + + switch (id) { + case AVIC_IPI_FAILURE_INVALID_INT_TYPE: + /* + * AVIC hardware handles the generation of + * IPIs when the specified Message Type is Fixed + * (also known as fixed delivery mode) and + * the Trigger Mode is edge-triggered. The hardware + * also supports self and broadcast delivery modes + * specified via the Destination Shorthand (DSH) + * field of the ICRL. Logical and physical APIC ID + * formats are supported. All other IPI types cause + * a #VMEXIT, which needs to be emulated. + */ + kvm_lapic_reg_write(apic, APIC_ICR2, icrh); + kvm_lapic_reg_write(apic, APIC_ICR, icrl); + break; + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + /* + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu.
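+ * A vCPU that is already running picks the IRR bits up directly
+ * from its vAPIC backing page, so only vcpus that are not
+ * running need an explicit wake-up here.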
+ */ + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } + break; + } + case AVIC_IPI_FAILURE_INVALID_TARGET: + WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", + index, svm->vcpu.vcpu_id, icrh, icrl); + break; + case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: + WARN_ONCE(1, "Invalid backing page\n"); + break; + default: + pr_err("Unknown IPI interception\n"); + } + + return 1; +} + +static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); + int index; + u32 *logical_apic_id_table; + int dlid = GET_APIC_LOGICAL_ID(ldr); + + if (!dlid) + return NULL; + + if (flat) { /* flat */ + index = ffs(dlid) - 1; + if (index > 7) + return NULL; + } else { /* cluster */ + int cluster = (dlid & 0xf0) >> 4; + int apic = ffs(dlid & 0x0f) - 1; + + if ((apic < 0) || (apic > 7) || + (cluster >= 0xf)) + return NULL; + index = (cluster << 2) + apic; + } + + logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); + + return &logical_apic_id_table[index]; +} + +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) +{ + bool flat; + u32 *entry, new_entry; + + flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; + entry = avic_get_logical_id_entry(vcpu, ldr, flat); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + return 0; +} + +static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool flat = svm->dfr_reg == APIC_DFR_FLAT; + u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + + if (entry) + clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); +} + +static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +{ + int ret = 0; + struct vcpu_svm *svm = to_svm(vcpu); + u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); + + if (ldr == svm->ldr_reg) + return 0; + + avic_invalidate_logical_id_entry(vcpu); + + if (ldr) + ret = avic_ldr_write(vcpu, id, ldr); + + if (!ret) + svm->ldr_reg = ldr; + + return ret; +} + +static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) +{ + u64 *old, *new; + struct vcpu_svm *svm = to_svm(vcpu); + u32 id = kvm_xapic_id(vcpu->arch.apic); + + if (vcpu->vcpu_id == id) + return 0; + + old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); + new = avic_get_physical_id_entry(vcpu, id); + if (!new || !old) + return 1; + + /* We need to move physical_id_entry to new offset */ + *new = *old; + *old = 0ULL; + to_svm(vcpu)->avic_physical_id_cache = new; + + /* + * Also update the guest physical APIC ID in the logical + * APIC ID table entry if already setup the LDR. 
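+ * Without this, the logical APIC ID table entry would still
+ * refer to the vCPU's old physical APIC ID after the move above.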
+ */ + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + + return 0; +} + +static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); + + if (svm->dfr_reg == dfr) + return; + + avic_invalidate_logical_id_entry(vcpu); + svm->dfr_reg = dfr; +} + +static int avic_unaccel_trap_write(struct vcpu_svm *svm) +{ + struct kvm_lapic *apic = svm->vcpu.arch.apic; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + + switch (offset) { + case APIC_ID: + if (avic_handle_apic_id_update(&svm->vcpu)) + return 0; + break; + case APIC_LDR: + if (avic_handle_ldr_update(&svm->vcpu)) + return 0; + break; + case APIC_DFR: + avic_handle_dfr_update(&svm->vcpu); + break; + default: + break; + } + + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); + + return 1; +} + +static bool is_avic_unaccelerated_access_trap(u32 offset) +{ + bool ret = false; + + switch (offset) { + case APIC_ID: + case APIC_EOI: + case APIC_RRR: + case APIC_LDR: + case APIC_DFR: + case APIC_SPIV: + case APIC_ESR: + case APIC_ICR: + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_TMICT: + case APIC_TDCR: + ret = true; + break; + default: + break; + } + return ret; +} + +int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +{ + int ret = 0; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + u32 vector = svm->vmcb->control.exit_info_2 & + AVIC_UNACCEL_ACCESS_VECTOR_MASK; + bool write = (svm->vmcb->control.exit_info_1 >> 32) & + AVIC_UNACCEL_ACCESS_WRITE_MASK; + bool trap = is_avic_unaccelerated_access_trap(offset); + + trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trap, write, vector); + if (trap) { + /* Handling Trap */ + WARN_ONCE(!write, "svm: Handling trap read.\n"); + ret = avic_unaccel_trap_write(svm); + } else { + /* Handling Fault */ + ret = kvm_emulate_instruction(&svm->vcpu, 0); + } + + return ret; +} + +int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (!avic || !irqchip_in_kernel(vcpu->kvm)) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + svm->dfr_reg = APIC_DFR_FLAT; + + return ret; +} + +void avic_post_state_restore(struct kvm_vcpu *vcpu) +{ + if (avic_handle_apic_id_update(vcpu) != 0) + return; + avic_handle_dfr_update(vcpu); + avic_handle_ldr_update(vcpu); +} + +void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) +{ + if (!avic || !lapic_in_kernel(vcpu)) + return; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_request_apicv_update(vcpu->kvm, activate, + APICV_INHIBIT_REASON_IRQWIN); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +} + +void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + return; +} + +void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ +} + +void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ +} + +static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; + + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. 
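+ * Depending on 'activate', each entry is switched between IOMMU
+ * guest mode (posted interrupts) and legacy remapping below.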
+ */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + + if (list_empty(&svm->ir_list)) + goto out; + + list_for_each_entry(ir, &svm->ir_list, node) { + if (activate) + ret = amd_iommu_activate_guest_mode(ir->data); + else + ret = amd_iommu_deactivate_guest_mode(ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; +} + +void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + bool activated = kvm_vcpu_apicv_active(vcpu); + + if (!avic) + return; + + if (activated) { + /** + * During AVIC temporary deactivation, guest could update + * APIC ID, DFR and LDR registers, which would not be trapped + * by avic_unaccelerated_access_interception(). In this case, + * we need to check and update the AVIC logical APIC ID table + * accordingly before re-activating. + */ + avic_post_state_restore(vcpu); + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + } else { + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + } + mark_dirty(vmcb, VMCB_AVIC); + + svm_set_pi_irte_mode(vcpu, activated); +} + +void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + return; +} + +int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) +{ + if (!vcpu->arch.apicv_active) + return -1; + + kvm_lapic_set_irr(vec, vcpu->arch.apic); + smp_mb__after_atomic(); + + if (avic_vcpu_is_running(vcpu)) { + int cpuid = vcpu->cpu; + + if (cpuid != get_cpu()) + wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); + put_cpu(); + } else + kvm_vcpu_wake_up(vcpu); + + return 0; +} + +bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + +static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + unsigned long flags; + struct amd_svm_iommu_ir *cur; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_for_each_entry(cur, &svm->ir_list, node) { + if (cur->data != pi->ir_data) + continue; + list_del(&cur->node); + kfree(cur); + break; + } + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + + /** + * In some cases, the existing irte is updated and re-set, + * so we need to check here if it's already been added + * to the ir_list. + */ + if (pi->ir_data && (pi->prev_ga_tag != 0)) { + struct kvm *kvm = svm->vcpu.kvm; + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); + struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + struct vcpu_svm *prev_svm; + + if (!prev_vcpu) { + ret = -EINVAL; + goto out; + } + + prev_svm = to_svm(prev_vcpu); + svm_ir_list_del(prev_svm, pi); + } + + /** + * Allocating new amd_iommu_pi_data, which will get + * added to the per-vcpu ir_list. + */ + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + if (!ir) { + ret = -ENOMEM; + goto out; + } + ir->data = pi->ir_data; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_add(&ir->node, &svm->ir_list); + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +out: + return ret; +} + +/** + * Note: + * The HW cannot support posting multicast/broadcast + * interrupts to a vCPU. So, we still use legacy interrupt + * remapping for these kinds of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU.
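+ * The single-destination check is performed by
+ * kvm_intr_is_single_vcpu() in get_pi_vcpu_info() below.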
+ */ +static int +get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +{ + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu = NULL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", + __func__, irq.vector); + return -1; + } + + pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, + irq.vector); + *svm = to_svm(vcpu); + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); + vcpu_info->vector = irq.vector; + + return 0; +} + +/* + * svm_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", + __func__, host_irq, guest_irq, set); + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + + /** + * Here, we set up legacy mode in the following cases: + * 1. When the interrupt cannot be targeted to a specific vcpu. + * 2. Unsetting posted interrupt. + * 3. APIC virtualization is disabled for the vcpu. + * 4. IRQ has incompatible delivery mode (SMI, INIT, etc.) + */ + if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; + + /* Try to enable guest_mode in IRTE */ + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & + AVIC_HPA_MASK); + pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Here, we have successfully set up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference it directly when we update vcpu + * scheduling information in the IOMMU irte. + */ + if (!ret && pi.is_guest_mode) + svm_ir_list_add(svm, &pi); + } else { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * set up with guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list.
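+ * The vCPU that previously owned the entry is looked up from the
+ * cached ga_tag, and the stale entry is removed from its ir_list.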
+ */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } + + if (!ret && svm) { + trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, + e->gsi, vcpu_info.vector, + vcpu_info.pi_desc_addr, set); + } + + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + +bool svm_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_NESTED) | + BIT(APICV_INHIBIT_REASON_IRQWIN) | + BIT(APICV_INHIBIT_REASON_PIT_REINJ) | + BIT(APICV_INHIBIT_REASON_X2APIC); + + return supported & BIT(bit); +} + +void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) +{ + avic_update_access_page(kvm, activate); +} + +static inline int +avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; + + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + + if (list_empty(&svm->ir_list)) + goto out; + + list_for_each_entry(ir, &svm->ir_list, node) { + ret = amd_iommu_update_ga(cpu, r, ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; +} + +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + u64 entry; + /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + int h_physical_id = kvm_cpu_get_apicid(cpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + /* + * Since the host physical APIC id is 8 bits, + * we can support host APIC ID upto 255. + */ + if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; + entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + if (svm->avic_is_running) + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, + svm->avic_is_running); +} + +void avic_vcpu_put(struct kvm_vcpu *vcpu) +{ + u64 entry; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + WRITE_ONCE(*(svm->avic_physical_id_cache), entry); +} + +/** + * This function is called during VCPU halt/unhalt. 
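+ * It toggles the is-running bit in the vCPU's physical APIC ID
+ * table entry, which determines whether the vCPU can take
+ * doorbell-delivered interrupts or needs an explicit wake-up.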
+ */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->avic_is_running = is_run; + if (is_run) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); +} + +void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + avic_set_running(vcpu, false); +} + +void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); + avic_set_running(vcpu, true); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a24e2de4acda..e32b4956fa06 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,17 +1,3 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Kernel-based Virtual Machine driver for Linux - * - * AMD SVM support - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay - * Avi Kivity - */ - #define pr_fmt(fmt) "SVM: " fmt #include @@ -28,10 +14,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -82,39 +68,12 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SVM_FEATURE_DECODE_ASSIST (1 << 7) #define SVM_FEATURE_PAUSE_FILTER (1 << 10) -#define SVM_AVIC_DOORBELL 0xc001011b - #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) #define TSC_RATIO_RSVD 0xffffff0000000000ULL #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - -/* AVIC GATAG is encoded using VM and VCPU IDs */ -#define AVIC_VCPU_ID_BITS 8 -#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) - -#define AVIC_VM_ID_BITS 24 -#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) -#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) - -#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ - (y & AVIC_VCPU_ID_MASK)) -#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) -#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -125,23 +84,6 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; */ static uint64_t osvw_len = 4, osvw_status; -/* - * This is a wrapper of struct amd_iommu_ir_data. 
- */ -struct amd_svm_iommu_ir { - struct list_head node; /* Used by SVM for per-vcpu ir_list */ - void *data; /* Storing pointer to struct amd_ir_data */ -}; - -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - static DEFINE_PER_CPU(u64, current_tsc_ratio); #define TSC_RATIO_DEFAULT 0x0100000000ULL @@ -231,12 +173,6 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); -/* enable / disable AVIC */ -static int avic; -#ifdef CONFIG_X86_LOCAL_APIC -module_param(avic, int, S_IRUGO); -#endif - /* enable/disable Next RIP Save */ static int nrips = true; module_param(nrips, int, 0444); @@ -259,10 +195,6 @@ module_param(dump_invalid_vmcb, bool, 0644); static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_complete_interrupts(struct vcpu_svm *svm); -static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); -static inline void avic_post_state_restore(struct kvm_vcpu *vcpu); - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); @@ -282,11 +214,6 @@ struct enc_region { }; -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) -{ - return container_of(kvm, struct kvm_svm, kvm); -} - static inline bool svm_sev_enabled(void) { return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0; @@ -310,23 +237,6 @@ static inline int sev_get_asid(struct kvm *kvm) return sev->asid; } -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - mark_dirty(svm->vmcb, VMCB_AVIC); -} - -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -853,52 +763,6 @@ void disable_nmi_singlestep(struct vcpu_svm *svm) } } -/* Note: - * This hash table is used to map VM_ID to a struct kvm_svm, - * when handling AMD IOMMU GALOG notification to schedule in - * a particular vCPU. - */ -#define SVM_VM_DATA_HASH_BITS 8 -static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); -static u32 next_vm_id = 0; -static bool next_vm_id_wrapped = 0; -static DEFINE_SPINLOCK(svm_vm_data_hash_lock); - -/* Note: - * This function is called from IOMMU driver to notify - * SVM to schedule in a particular vCPU of a particular VM. 
- */ -static int avic_ga_log_notifier(u32 ga_tag) -{ - unsigned long flags; - struct kvm_svm *kvm_svm; - struct kvm_vcpu *vcpu = NULL; - u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); - - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); - trace_kvm_avic_ga_log(vm_id, vcpu_id); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { - if (kvm_svm->avic_vm_id != vm_id) - continue; - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); - break; - } - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - - /* Note: - * At this point, the IOMMU should have already set the pending - * bit in the vAPIC backing page. So, we just need to schedule - * in the vcpu. - */ - if (vcpu) - kvm_vcpu_wake_up(vcpu); - - return 0; -} - static __init int sev_hardware_setup(void) { struct sev_user_data_status *status; @@ -1227,24 +1091,6 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } -static void avic_init_vmcb(struct vcpu_svm *svm) -{ - struct vmcb *vmcb = svm->vmcb; - struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); - - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; - if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; -} - static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1404,92 +1250,6 @@ static void init_vmcb(struct vcpu_svm *svm) } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, - unsigned int index) -{ - u64 *avic_physical_id_table; - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) - return NULL; - - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); - - return &avic_physical_id_table[index]; -} - -/** - * Note: - * AVIC hardware walks the nested page table to check permissions, - * but does not use the SPA address specified in the leaf page - * table entry since it uses address in the AVIC_BACKING_PAGE pointer - * field of the VMCB. Therefore, we set up the - * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. - */ -static int avic_update_access_page(struct kvm *kvm, bool activate) -{ - int ret = 0; - - mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_page_done == activate) || - (kvm->mm != current->mm)) - goto out; - - ret = __x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - activate ? 
PAGE_SIZE : 0); - if (ret) - goto out; - - kvm->arch.apic_access_page_done = activate; -out: - mutex_unlock(&kvm->slots_lock); - return ret; -} - -static int avic_init_backing_page(struct kvm_vcpu *vcpu) -{ - u64 *entry, new_entry; - int id = vcpu->vcpu_id; - struct vcpu_svm *svm = to_svm(vcpu); - - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) - return -EINVAL; - - if (!svm->vcpu.arch.apic->regs) - return -EINVAL; - - if (kvm_apicv_activated(vcpu->kvm)) { - int ret; - - ret = avic_update_access_page(vcpu->kvm, true); - if (ret) - return ret; - } - - svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); - - /* Setting AVIC backing page address in the phy APIC ID table */ - entry = avic_get_physical_id_entry(vcpu, id); - if (!entry) - return -EINVAL; - - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); - WRITE_ONCE(*entry, new_entry); - - svm->avic_physical_id_cache = entry; - - return 0; -} - static void sev_asid_free(int asid) { struct svm_cpu_data *sd; @@ -1665,84 +1425,12 @@ static void sev_vm_destroy(struct kvm *kvm) sev_asid_free(sev->asid); } -static void avic_vm_destroy(struct kvm *kvm) -{ - unsigned long flags; - struct kvm_svm *kvm_svm = to_kvm_svm(kvm); - - if (!avic) - return; - - if (kvm_svm->avic_logical_id_table_page) - __free_page(kvm_svm->avic_logical_id_table_page); - if (kvm_svm->avic_physical_id_table_page) - __free_page(kvm_svm->avic_physical_id_table_page); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - hash_del(&kvm_svm->hnode); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); -} - static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); } -static int avic_vm_init(struct kvm *kvm) -{ - unsigned long flags; - int err = -ENOMEM; - struct kvm_svm *kvm_svm = to_kvm_svm(kvm); - struct kvm_svm *k2; - struct page *p_page; - struct page *l_page; - u32 vm_id; - - if (!avic) - return 0; - - /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!p_page) - goto free_avic; - - kvm_svm->avic_physical_id_table_page = p_page; - clear_page(page_address(p_page)); - - /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!l_page) - goto free_avic; - - kvm_svm->avic_logical_id_table_page = l_page; - clear_page(page_address(l_page)); - - spin_lock_irqsave(&svm_vm_data_hash_lock, flags); - again: - vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; - if (vm_id == 0) { /* id is 1-based, zero is not okay */ - next_vm_id_wrapped = 1; - goto again; - } - /* Is it still in use? 
Only possible if wrapped at least once */ - if (next_vm_id_wrapped) { - hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { - if (k2->avic_vm_id == vm_id) - goto again; - } - } - kvm_svm->avic_vm_id = vm_id; - hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); - spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - - return 0; - -free_avic: - avic_vm_destroy(kvm); - return err; -} - static int svm_vm_init(struct kvm *kvm) { if (avic) { @@ -1755,98 +1443,6 @@ static int svm_vm_init(struct kvm *kvm) return 0; } -static inline int -avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. - */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(ir, &svm->ir_list, node) { - ret = amd_iommu_update_ga(cpu, r, ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; -} - -static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - u64 entry; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - int h_physical_id = kvm_cpu_get_apicid(cpu); - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - /* - * Since the host physical APIC id is 8 bits, - * we can support host APIC ID upto 255. - */ - if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) - return; - - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; - entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); -} - -static void avic_vcpu_put(struct kvm_vcpu *vcpu) -{ - u64 entry; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); -} - -/** - * This function is called during VCPU halt/unhalt. 
- */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->avic_is_running = is_run; - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); -} - static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1871,25 +1467,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } -static int avic_init_vcpu(struct vcpu_svm *svm) -{ - int ret; - struct kvm_vcpu *vcpu = &svm->vcpu; - - if (!avic || !irqchip_in_kernel(vcpu->kvm)) - return 0; - - ret = avic_init_backing_page(&svm->vcpu); - if (ret) - return ret; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - svm->dfr_reg = APIC_DFR_FLAT; - - return ret; -} - static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; @@ -2046,18 +1623,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } -static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) -{ - avic_set_running(vcpu, false); -} - -static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) -{ - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); - avic_set_running(vcpu, true); -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3437,276 +3002,6 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; - -static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) -{ - u32 icrh = svm->vmcb->control.exit_info_1 >> 32; - u32 icrl = svm->vmcb->control.exit_info_1; - u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0xFF; - struct kvm_lapic *apic = svm->vcpu.arch.apic; - - trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); - - switch (id) { - case AVIC_IPI_FAILURE_INVALID_INT_TYPE: - /* - * AVIC hardware handles the generation of - * IPIs when the specified Message Type is Fixed - * (also known as fixed delivery mode) and - * the Trigger Mode is edge-triggered. The hardware - * also supports self and broadcast delivery modes - * specified via the Destination Shorthand(DSH) - * field of the ICRL. Logical and physical APIC ID - * formats are supported. All other IPI types cause - * a #VMEXIT, which needs to emulated. - */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); - break; - case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { - int i; - struct kvm_vcpu *vcpu; - struct kvm *kvm = svm->vcpu.kvm; - struct kvm_lapic *apic = svm->vcpu.arch.apic; - - /* - * At this point, we expect that the AVIC HW has already - * set the appropriate IRR bits on the valid target - * vcpus. So, we just need to kick the appropriate vcpu. 
- */ - kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, apic, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); - } - break; - } - case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, svm->vcpu.vcpu_id, icrh, icrl); - break; - case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: - WARN_ONCE(1, "Invalid backing page\n"); - break; - default: - pr_err("Unknown IPI interception\n"); - } - - return 1; -} - -static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) -{ - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - int index; - u32 *logical_apic_id_table; - int dlid = GET_APIC_LOGICAL_ID(ldr); - - if (!dlid) - return NULL; - - if (flat) { /* flat */ - index = ffs(dlid) - 1; - if (index > 7) - return NULL; - } else { /* cluster */ - int cluster = (dlid & 0xf0) >> 4; - int apic = ffs(dlid & 0x0f) - 1; - - if ((apic < 0) || (apic > 7) || - (cluster >= 0xf)) - return NULL; - index = (cluster << 2) + apic; - } - - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); - - return &logical_apic_id_table[index]; -} - -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) -{ - bool flat; - u32 *entry, new_entry; - - flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; - entry = avic_get_logical_id_entry(vcpu, ldr, flat); - if (!entry) - return -EINVAL; - - new_entry = READ_ONCE(*entry); - new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; - new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); - new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; - WRITE_ONCE(*entry, new_entry); - - return 0; -} - -static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); - - if (entry) - clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); -} - -static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) -{ - int ret = 0; - struct vcpu_svm *svm = to_svm(vcpu); - u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); - u32 id = kvm_xapic_id(vcpu->arch.apic); - - if (ldr == svm->ldr_reg) - return 0; - - avic_invalidate_logical_id_entry(vcpu); - - if (ldr) - ret = avic_ldr_write(vcpu, id, ldr); - - if (!ret) - svm->ldr_reg = ldr; - - return ret; -} - -static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) -{ - u64 *old, *new; - struct vcpu_svm *svm = to_svm(vcpu); - u32 id = kvm_xapic_id(vcpu->arch.apic); - - if (vcpu->vcpu_id == id) - return 0; - - old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); - new = avic_get_physical_id_entry(vcpu, id); - if (!new || !old) - return 1; - - /* We need to move physical_id_entry to new offset */ - *new = *old; - *old = 0ULL; - to_svm(vcpu)->avic_physical_id_cache = new; - - /* - * Also update the guest physical APIC ID in the logical - * APIC ID table entry if already setup the LDR. 
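The flat/cluster indexing in avic_get_logical_id_entry() is self-contained arithmetic and can be exercised in isolation. A user-space sketch, where a return of -1 stands in for the kernel's NULL:

#include <stdbool.h>
#include <strings.h>	/* ffs() */

static int avic_logical_index(unsigned int dlid, bool flat)
{
	if (!dlid)
		return -1;

	if (flat) {		/* one bit per APIC, bits 0-7 */
		int index = ffs(dlid) - 1;

		return index > 7 ? -1 : index;
	} else {		/* cluster id in bits 4-7, member in bits 0-3 */
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;
	}
}

For example, a flat logical ID of 0x08 maps to index 3, while cluster ID 0x21 (cluster 2, member 0) maps to (2 << 2) + 0 = 8.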
- */ - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - - return 0; -} - -static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); - - if (svm->dfr_reg == dfr) - return; - - avic_invalidate_logical_id_entry(vcpu); - svm->dfr_reg = dfr; -} - -static int avic_unaccel_trap_write(struct vcpu_svm *svm) -{ - struct kvm_lapic *apic = svm->vcpu.arch.apic; - u32 offset = svm->vmcb->control.exit_info_1 & - AVIC_UNACCEL_ACCESS_OFFSET_MASK; - - switch (offset) { - case APIC_ID: - if (avic_handle_apic_id_update(&svm->vcpu)) - return 0; - break; - case APIC_LDR: - if (avic_handle_ldr_update(&svm->vcpu)) - return 0; - break; - case APIC_DFR: - avic_handle_dfr_update(&svm->vcpu); - break; - default: - break; - } - - kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); - - return 1; -} - -static bool is_avic_unaccelerated_access_trap(u32 offset) -{ - bool ret = false; - - switch (offset) { - case APIC_ID: - case APIC_EOI: - case APIC_RRR: - case APIC_LDR: - case APIC_DFR: - case APIC_SPIV: - case APIC_ESR: - case APIC_ICR: - case APIC_LVTT: - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVT0: - case APIC_LVT1: - case APIC_LVTERR: - case APIC_TMICT: - case APIC_TDCR: - ret = true; - break; - default: - break; - } - return ret; -} - -static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) -{ - int ret = 0; - u32 offset = svm->vmcb->control.exit_info_1 & - AVIC_UNACCEL_ACCESS_OFFSET_MASK; - u32 vector = svm->vmcb->control.exit_info_2 & - AVIC_UNACCEL_ACCESS_VECTOR_MASK; - bool write = (svm->vmcb->control.exit_info_1 >> 32) & - AVIC_UNACCEL_ACCESS_WRITE_MASK; - bool trap = is_avic_unaccelerated_access_trap(offset); - - trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, - trap, write, vector); - if (trap) { - /* Handling Trap */ - WARN_ONCE(!write, "svm: Handling trap read.\n"); - ret = avic_unaccel_trap_write(svm); - } else { - /* Handling Fault */ - ret = kvm_emulate_instruction(&svm->vcpu, 0); - } - - return ret; -} - static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -4074,324 +3369,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) -{ - return; -} - -static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ -} - -static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ -} - -static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!avic || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - -static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - struct vcpu_svm *svm = to_svm(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) - return 0; - - /* - * Here, we go through the per-vcpu ir_list to update all existing - * interrupt remapping table entry targeting this vcpu. 
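avic_unaccelerated_access_interception() splits its exit info the same way every time. A sketch of the decode; the struct is hypothetical and the mask values shown are illustrative assumptions (the authoritative definitions live in arch/x86/include/asm/svm.h):

#define AVIC_UNACCEL_ACCESS_WRITE_MASK	1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK	0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK	0xFFFFFFFF

struct avic_unaccel_exit {
	u32 offset;	/* which APIC register was touched */
	u32 vector;
	bool write;
};

static struct avic_unaccel_exit avic_decode_unaccel(u64 exit_info_1,
						    u64 exit_info_2)
{
	struct avic_unaccel_exit e = {
		.offset = exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK,
		.vector = exit_info_2 & AVIC_UNACCEL_ACCESS_VECTOR_MASK,
		.write  = (exit_info_1 >> 32) & AVIC_UNACCEL_ACCESS_WRITE_MASK,
	};

	return e;
}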
- */ - spin_lock_irqsave(&svm->ir_list_lock, flags); - - if (list_empty(&svm->ir_list)) - goto out; - - list_for_each_entry(ir, &svm->ir_list, node) { - if (activate) - ret = amd_iommu_activate_guest_mode(ir->data); - else - ret = amd_iommu_deactivate_guest_mode(ir->data); - if (ret) - break; - } -out: - spin_unlock_irqrestore(&svm->ir_list_lock, flags); - return ret; -} - -static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - bool activated = kvm_vcpu_apicv_active(vcpu); - - if (!avic) - return; - - if (activated) { - /** - * During AVIC temporary deactivation, guest could update - * APIC ID, DFR and LDR registers, which would not be trapped - * by avic_unaccelerated_access_interception(). In this case, - * we need to check and update the AVIC logical APIC ID table - * accordingly before re-activating. - */ - avic_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - } - mark_dirty(vmcb, VMCB_AVIC); - - svm_set_pi_irte_mode(vcpu, activated); -} - -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - return; -} - -static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - kvm_lapic_set_irr(vec, vcpu->arch.apic); - smp_mb__after_atomic(); - - if (avic_vcpu_is_running(vcpu)) { - int cpuid = vcpu->cpu; - - if (cpuid != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); - put_cpu(); - } else - kvm_vcpu_wake_up(vcpu); - - return 0; -} - -static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - -static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - unsigned long flags; - struct amd_svm_iommu_ir *cur; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_for_each_entry(cur, &svm->ir_list, node) { - if (cur->data != pi->ir_data) - continue; - list_del(&cur->node); - kfree(cur); - break; - } - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -} - -static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) -{ - int ret = 0; - unsigned long flags; - struct amd_svm_iommu_ir *ir; - - /** - * In some cases, the existing irte is updaed and re-set, - * so we need to check here if it's already been * added - * to the ir_list. - */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { - struct kvm *kvm = svm->vcpu.kvm; - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); - struct vcpu_svm *prev_svm; - - if (!prev_vcpu) { - ret = -EINVAL; - goto out; - } - - prev_svm = to_svm(prev_vcpu); - svm_ir_list_del(prev_svm, pi); - } - - /** - * Allocating new amd_iommu_pi_data, which will get - * add to the per-vcpu ir_list. - */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); - if (!ir) { - ret = -ENOMEM; - goto out; - } - ir->data = pi->ir_data; - - spin_lock_irqsave(&svm->ir_list_lock, flags); - list_add(&ir->node, &svm->ir_list); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); -out: - return ret; -} - -/** - * Note: - * The HW cannot support posting multicast/broadcast - * interrupts to a vCPU. So, we still use legacy interrupt - * remapping for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. 
user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - */ -static int -get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) -{ - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu = NULL; - - kvm_set_msi_irq(kvm, e, &irq); - - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", - __func__, irq.vector); - return -1; - } - - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, - irq.vector); - *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); - vcpu_info->vector = irq.vector; - - return 0; -} - -/* - * svm_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) - return 0; - - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", - __func__, host_irq, guest_irq, set); - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - struct vcpu_data vcpu_info; - struct vcpu_svm *svm = NULL; - - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - - /** - * Here, we setup with legacy mode in the following cases: - * 1. When cannot target interrupt to a specific vcpu. - * 2. Unsetting posted interrupt. - * 3. APIC virtialization is disabled for the vcpu. - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) - */ - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && - kvm_vcpu_apicv_active(&svm->vcpu)) { - struct amd_iommu_pi_data pi; - - /* Try to enable guest_mode in IRTE */ - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & - AVIC_HPA_MASK); - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, - svm->vcpu.vcpu_id); - pi.is_guest_mode = true; - pi.vcpu_data = &vcpu_info; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Here, we successfully setting up vcpu affinity in - * IOMMU guest mode. Now, we need to store the posted - * interrupt information in a per-vcpu ir_list so that - * we can reference to them directly when we update vcpu - * scheduling information in IOMMU irte. - */ - if (!ret && pi.is_guest_mode) - svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. 
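The ga_tag that ties an IRTE back to a (VM, vCPU) pair is a simple bit-field pack. Roughly what AVIC_GATAG()/AVIC_GATAG_TO_VCPUID() expand to — the field widths shown are an assumption worth checking against the tree:

#define AVIC_VCPU_ID_BITS	8
#define AVIC_VCPU_ID_MASK	((1 << AVIC_VCPU_ID_BITS) - 1)

static u32 avic_gatag(u32 vm_id, u32 vcpu_id)
{
	return (vm_id << AVIC_VCPU_ID_BITS) | (vcpu_id & AVIC_VCPU_ID_MASK);
}

static u32 avic_gatag_to_vcpuid(u32 ga_tag)
{
	return ga_tag & AVIC_VCPU_ID_MASK;
}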
- */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } - } - - if (!ret && svm) { - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, - e->gsi, vcpu_info.vector, - vcpu_info.pi_desc_addr, set); - } - - if (ret < 0) { - pr_err("%s: failed to update PI IRTE\n", __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5159,14 +4136,6 @@ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } -static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) -{ - if (avic_handle_apic_id_update(vcpu) != 0) - return; - avic_handle_dfr_update(vcpu); - avic_handle_ldr_update(vcpu); -} - static void svm_setup_mce(struct kvm_vcpu *vcpu) { /* [63:9] are reserved. */ @@ -6214,23 +5183,6 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); } -static bool svm_check_apicv_inhibit_reasons(ulong bit) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_NESTED) | - BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC); - - return supported & BIT(bit); -} - -static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} - static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f4c446d7a31e..c7abc1fede97 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -173,6 +173,11 @@ struct vcpu_svm { void recalc_intercepts(struct vcpu_svm *svm); +static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_svm, kvm); +} + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -378,4 +383,61 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, int svm_check_nested_events(struct kvm_vcpu *vcpu); int nested_svm_exit_special(struct vcpu_svm *svm); +/* avic.c */ + +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) + +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + +extern int avic; + +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) +{ + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; + mark_dirty(svm->vmcb, VMCB_AVIC); +} + +static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 *entry = svm->avic_physical_id_cache; + + if (!entry) + return false; + + return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); +} + +int avic_ga_log_notifier(u32 ga_tag); +void avic_vm_destroy(struct kvm *kvm); +int avic_vm_init(struct kvm *kvm); +void avic_init_vmcb(struct vcpu_svm *svm); +void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool 
activate); +int avic_incomplete_ipi_interception(struct vcpu_svm *svm); +int avic_unaccelerated_access_interception(struct vcpu_svm *svm); +int avic_init_vcpu(struct vcpu_svm *svm); +void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void avic_vcpu_put(struct kvm_vcpu *vcpu); +void avic_post_state_restore(struct kvm_vcpu *vcpu); +void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +bool svm_check_apicv_inhibit_reasons(ulong bit); +void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); +void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); +void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); +bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); +int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); +void svm_vcpu_blocking(struct kvm_vcpu *vcpu); +void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); + #endif -- cgit v1.2.3 From eaf78265a4ab33935d3a0f1407ce4a91aac4d4d5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Mar 2020 10:41:54 +0100 Subject: KVM: SVM: Move SEV code to separate file Move the SEV specific parts of svm.c into the new sev.c file. Signed-off-by: Joerg Roedel Message-Id: <20200324094154.32352-5-joro@8bytes.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/svm/sev.c | 1187 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1241 +----------------------------------------------- arch/x86/kvm/svm/svm.h | 48 ++ 4 files changed, 1257 insertions(+), 1221 deletions(-) create mode 100644 arch/x86/kvm/svm/sev.c (limited to 'arch/x86/kvm/Makefile') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31aca07d8f12..e5a71aa0967b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o -kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o svm/avic.o +kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c new file mode 100644 index 000000000000..0e3fc311d7da --- /dev/null +++ b/arch/x86/kvm/svm/sev.c @@ -0,0 +1,1187 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM-SEV support + * + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + */ + +#include +#include +#include +#include +#include +#include + +#include "x86.h" +#include "svm.h" + +static int sev_flush_asids(void); +static DECLARE_RWSEM(sev_deactivate_lock); +static DEFINE_MUTEX(sev_bitmap_lock); +unsigned int max_sev_asid; +static unsigned int min_sev_asid; +static unsigned long *sev_asid_bitmap; +static unsigned long *sev_reclaim_asid_bitmap; +#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) + +struct enc_region { + struct list_head list; + unsigned long npages; + struct page **pages; + unsigned long uaddr; + unsigned long size; +}; + +static int sev_flush_asids(void) +{ + int ret, error = 0; + + /* + * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, + * so it must be guarded. 
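The lock choice deserves a note: DEACTIVATE commands for different guests may run concurrently, so they take the semaphore for read, while the WBINVD + DF_FLUSH pair must exclude every DEACTIVATE and takes it for write. Condensed from the two call sites in this file:

static DECLARE_RWSEM(sev_deactivate_lock);

/* Read side, per guest (sev_unbind_asid): many may run at once. */
down_read(&sev_deactivate_lock);
sev_guest_deactivate(data, NULL);
up_read(&sev_deactivate_lock);

/* Write side, global (sev_flush_asids): no DEACTIVATE may overlap,
 * or it would clear the WBINVD-done indicator DF_FLUSH relies on. */
down_write(&sev_deactivate_lock);
wbinvd_on_all_cpus();
ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);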
+ */ + down_write(&sev_deactivate_lock); + + wbinvd_on_all_cpus(); + ret = sev_guest_df_flush(&error); + + up_write(&sev_deactivate_lock); + + if (ret) + pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + + return ret; +} + +/* Must be called with the sev_bitmap_lock held */ +static bool __sev_recycle_asids(void) +{ + int pos; + + /* Check if there are any ASIDs to reclaim before performing a flush */ + pos = find_next_bit(sev_reclaim_asid_bitmap, + max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return false; + + if (sev_flush_asids()) + return false; + + bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, + max_sev_asid); + bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + + return true; +} + +static int sev_asid_new(void) +{ + bool retry = true; + int pos; + + mutex_lock(&sev_bitmap_lock); + + /* + * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. + */ +again: + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) { + if (retry && __sev_recycle_asids()) { + retry = false; + goto again; + } + mutex_unlock(&sev_bitmap_lock); + return -EBUSY; + } + + __set_bit(pos, sev_asid_bitmap); + + mutex_unlock(&sev_bitmap_lock); + + return pos + 1; +} + +static int sev_get_asid(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return sev->asid; +} + +static void sev_asid_free(int asid) +{ + struct svm_cpu_data *sd; + int cpu, pos; + + mutex_lock(&sev_bitmap_lock); + + pos = asid - 1; + __set_bit(pos, sev_reclaim_asid_bitmap); + + for_each_possible_cpu(cpu) { + sd = per_cpu(svm_data, cpu); + sd->sev_vmcbs[pos] = NULL; + } + + mutex_unlock(&sev_bitmap_lock); +} + +static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +{ + struct sev_data_decommission *decommission; + struct sev_data_deactivate *data; + + if (!handle) + return; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + /* deactivate handle */ + data->handle = handle; + + /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ + down_read(&sev_deactivate_lock); + sev_guest_deactivate(data, NULL); + up_read(&sev_deactivate_lock); + + kfree(data); + + decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); + if (!decommission) + return; + + /* decommission handle */ + decommission->handle = handle; + sev_guest_decommission(decommission, NULL); + + kfree(decommission); +} + +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + int asid, ret; + + ret = -EBUSY; + if (unlikely(sev->active)) + return ret; + + asid = sev_asid_new(); + if (asid < 0) + return ret; + + ret = sev_platform_init(&argp->error); + if (ret) + goto e_free; + + sev->active = true; + sev->asid = asid; + INIT_LIST_HEAD(&sev->regions_list); + + return 0; + +e_free: + sev_asid_free(asid); + return ret; +} + +static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) +{ + struct sev_data_activate *data; + int asid = sev_get_asid(kvm); + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + /* activate ASID on the given handle */ + data->handle = handle; + data->asid = asid; + ret = sev_guest_activate(data, error); + kfree(data); + + return ret; +} + +static int __sev_issue_cmd(int fd, int id, void *data, int *error) +{ + struct fd f; + int ret; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = sev_issue_cmd_external_user(f.file, id, data, 
error); + + fdput(f); + return ret; +} + +static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + return __sev_issue_cmd(sev->fd, id, data, error); +} + +static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_start *start; + struct kvm_sev_launch_start params; + void *dh_blob, *session_blob; + int *error = &argp->error; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); + if (!start) + return -ENOMEM; + + dh_blob = NULL; + if (params.dh_uaddr) { + dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); + if (IS_ERR(dh_blob)) { + ret = PTR_ERR(dh_blob); + goto e_free; + } + + start->dh_cert_address = __sme_set(__pa(dh_blob)); + start->dh_cert_len = params.dh_len; + } + + session_blob = NULL; + if (params.session_uaddr) { + session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); + if (IS_ERR(session_blob)) { + ret = PTR_ERR(session_blob); + goto e_free_dh; + } + + start->session_address = __sme_set(__pa(session_blob)); + start->session_len = params.session_len; + } + + start->handle = params.handle; + start->policy = params.policy; + + /* create memory encryption context */ + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + if (ret) + goto e_free_session; + + /* Bind ASID to this guest */ + ret = sev_bind_asid(kvm, start->handle, error); + if (ret) + goto e_free_session; + + /* return handle to userspace */ + params.handle = start->handle; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + sev_unbind_asid(kvm, start->handle); + ret = -EFAULT; + goto e_free_session; + } + + sev->handle = start->handle; + sev->fd = argp->sev_fd; + +e_free_session: + kfree(session_blob); +e_free_dh: + kfree(dh_blob); +e_free: + kfree(start); + return ret; +} + +static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, + unsigned long ulen, unsigned long *n, + int write) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + unsigned long npages, npinned, size; + unsigned long locked, lock_limit; + struct page **pages; + unsigned long first, last; + + if (ulen == 0 || uaddr + ulen < uaddr) + return NULL; + + /* Calculate number of pages. */ + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT; + npages = (last - first + 1); + + locked = sev->pages_locked + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); + return NULL; + } + + /* Avoid using vmalloc for smaller buffers. */ + size = npages * sizeof(struct page *); + if (size > PAGE_SIZE) + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); + else + pages = kmalloc(size, GFP_KERNEL_ACCOUNT); + + if (!pages) + return NULL; + + /* Pin the user virtual address. 
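The page-span arithmetic in sev_pin_memory() is easy to get off by one. A user-space sketch with the x86 4 KiB constants spelled out; the helper name is hypothetical:

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static unsigned long span_pages(unsigned long uaddr, unsigned long ulen)
{
	unsigned long first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
	unsigned long last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;

	return last - first + 1;
}

span_pages(0x1fff, 2) pins two pages even though ulen is only two bytes, which is why the memlock accounting above counts npages rather than ulen.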
*/ + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); + if (npinned != npages) { + pr_err("SEV: Failure locking %lu pages.\n", npages); + goto err; + } + + *n = npages; + sev->pages_locked = locked; + + return pages; + +err: + if (npinned > 0) + release_pages(pages, npinned); + + kvfree(pages); + return NULL; +} + +static void sev_unpin_memory(struct kvm *kvm, struct page **pages, + unsigned long npages) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + release_pages(pages, npages); + kvfree(pages); + sev->pages_locked -= npages; +} + +static void sev_clflush_pages(struct page *pages[], unsigned long npages) +{ + uint8_t *page_virtual; + unsigned long i; + + if (npages == 0 || pages == NULL) + return; + + for (i = 0; i < npages; i++) { + page_virtual = kmap_atomic(pages[i]); + clflush_cache_range(page_virtual, PAGE_SIZE); + kunmap_atomic(page_virtual); + } +} + +static unsigned long get_num_contig_pages(unsigned long idx, + struct page **inpages, unsigned long npages) +{ + unsigned long paddr, next_paddr; + unsigned long i = idx + 1, pages = 1; + + /* find the number of contiguous pages starting from idx */ + paddr = __sme_page_pa(inpages[idx]); + while (i < npages) { + next_paddr = __sme_page_pa(inpages[i++]); + if ((paddr + PAGE_SIZE) == next_paddr) { + pages++; + paddr = next_paddr; + continue; + } + break; + } + + return pages; +} + +static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_launch_update_data params; + struct sev_data_launch_update_data *data; + struct page **inpages; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + vaddr = params.uaddr; + size = params.len; + vaddr_end = vaddr + size; + + /* Lock the user memory. */ + inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + if (!inpages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The LAUNCH_UPDATE command will perform in-place encryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(inpages, npages); + + for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { + int offset, len; + + /* + * If the user buffer is not page-aligned, calculate the offset + * within the page. + */ + offset = vaddr & (PAGE_SIZE - 1); + + /* Calculate the number of pages that can be encrypted in one go. 
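Each LAUNCH_UPDATE_DATA iteration encrypts one physically contiguous run; the bytes it may cover are bounded by the run size minus the offset into the first page, and by the bytes still remaining. A sketch of that bound, mirroring the min_t() in the loop (PAGE_SIZE as in the earlier sketch):

/* Bytes one SEV command may cover, given a run of 'pages' physically
 * contiguous pages, an offset into the first page, and 'size' bytes
 * still to encrypt. */
static unsigned long chunk_len(unsigned long pages, unsigned long offset,
			       unsigned long size)
{
	unsigned long span = pages * PAGE_SIZE - offset;

	return span < size ? span : size;
}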
*/ + pages = get_num_contig_pages(i, inpages, npages); + + len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); + + data->handle = sev->handle; + data->len = len; + data->address = __sme_page_pa(inpages[i]) + offset; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error); + if (ret) + goto e_unpin; + + size -= len; + next_vaddr = vaddr + len; + } + +e_unpin: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < npages; i++) { + set_page_dirty_lock(inpages[i]); + mark_page_accessed(inpages[i]); + } + /* unlock the user pages */ + sev_unpin_memory(kvm, inpages, npages); +e_free: + kfree(data); + return ret; +} + +static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + void __user *measure = (void __user *)(uintptr_t)argp->data; + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_measure *data; + struct kvm_sev_launch_measure params; + void __user *p = NULL; + void *blob = NULL; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, measure, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + /* User wants to query the blob length */ + if (!params.len) + goto cmd; + + p = (void __user *)(uintptr_t)params.uaddr; + if (p) { + if (params.len > SEV_FW_BLOB_MAX_SIZE) { + ret = -EINVAL; + goto e_free; + } + + ret = -ENOMEM; + blob = kmalloc(params.len, GFP_KERNEL); + if (!blob) + goto e_free; + + data->address = __psp_pa(blob); + data->len = params.len; + } + +cmd: + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error); + + /* + * If we query the session length, FW responded with expected data. + */ + if (!params.len) + goto done; + + if (ret) + goto e_free_blob; + + if (blob) { + if (copy_to_user(p, blob, params.len)) + ret = -EFAULT; + } + +done: + params.len = data->len; + if (copy_to_user(measure, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + +static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_finish *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error); + + kfree(data); + return ret; +} + +static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_guest_status params; + struct sev_data_guest_status *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error); + if (ret) + goto e_free; + + params.policy = data->policy; + params.state = data->state; + params.handle = data->handle; + + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free: + kfree(data); + return ret; +} + +static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, + unsigned long dst, int size, + int *error, bool enc) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_dbg *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + 
data->handle = sev->handle; + data->dst_addr = dst; + data->src_addr = src; + data->len = size; + + ret = sev_issue_cmd(kvm, + enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, + data, error); + kfree(data); + return ret; +} + +static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, + unsigned long dst_paddr, int sz, int *err) +{ + int offset; + + /* + * Its safe to read more than we are asked, caller should ensure that + * destination has enough space. + */ + src_paddr = round_down(src_paddr, 16); + offset = src_paddr & 15; + sz = round_up(sz + offset, 16); + + return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); +} + +static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user dst_uaddr, + unsigned long dst_paddr, + int size, int *err) +{ + struct page *tpage = NULL; + int ret, offset; + + /* if inputs are not 16-byte then use intermediate buffer */ + if (!IS_ALIGNED(dst_paddr, 16) || + !IS_ALIGNED(paddr, 16) || + !IS_ALIGNED(size, 16)) { + tpage = (void *)alloc_page(GFP_KERNEL); + if (!tpage) + return -ENOMEM; + + dst_paddr = __sme_page_pa(tpage); + } + + ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); + if (ret) + goto e_free; + + if (tpage) { + offset = paddr & 15; + if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, + page_address(tpage) + offset, size)) + ret = -EFAULT; + } + +e_free: + if (tpage) + __free_page(tpage); + + return ret; +} + +static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user vaddr, + unsigned long dst_paddr, + unsigned long __user dst_vaddr, + int size, int *error) +{ + struct page *src_tpage = NULL; + struct page *dst_tpage = NULL; + int ret, len = size; + + /* If source buffer is not aligned then use an intermediate buffer */ + if (!IS_ALIGNED(vaddr, 16)) { + src_tpage = alloc_page(GFP_KERNEL); + if (!src_tpage) + return -ENOMEM; + + if (copy_from_user(page_address(src_tpage), + (void __user *)(uintptr_t)vaddr, size)) { + __free_page(src_tpage); + return -EFAULT; + } + + paddr = __sme_page_pa(src_tpage); + } + + /* + * If destination buffer or length is not aligned then do read-modify-write: + * - decrypt destination in an intermediate buffer + * - copy the source buffer in an intermediate buffer + * - use the intermediate buffer as source buffer + */ + if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { + int dst_offset; + + dst_tpage = alloc_page(GFP_KERNEL); + if (!dst_tpage) { + ret = -ENOMEM; + goto e_free; + } + + ret = __sev_dbg_decrypt(kvm, dst_paddr, + __sme_page_pa(dst_tpage), size, error); + if (ret) + goto e_free; + + /* + * If source is kernel buffer then use memcpy() otherwise + * copy_from_user(). 
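The DBG firmware commands only accept 16-byte-aligned addresses and lengths, so the helpers widen every request. The intended arithmetic as a hypothetical helper, with a worked case:

/* Decrypting 5 bytes at physical 0x1003: offset = 3, the aligned base
 * is 0x1000, and the span becomes round_up(5 + 3, 16) = 16 bytes. */
static void widen_to_16(unsigned long paddr, int sz,
			unsigned long *aligned, int *span)
{
	int offset = paddr & 15;

	*aligned = paddr & ~15UL;
	*span = (sz + offset + 15) & ~15;
}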
+ */ + dst_offset = dst_paddr & 15; + + if (src_tpage) + memcpy(page_address(dst_tpage) + dst_offset, + page_address(src_tpage), size); + else { + if (copy_from_user(page_address(dst_tpage) + dst_offset, + (void __user *)(uintptr_t)vaddr, size)) { + ret = -EFAULT; + goto e_free; + } + } + + paddr = __sme_page_pa(dst_tpage); + dst_paddr = round_down(dst_paddr, 16); + len = round_up(size, 16); + } + + ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); + +e_free: + if (src_tpage) + __free_page(src_tpage); + if (dst_tpage) + __free_page(dst_tpage); + return ret; +} + +static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) +{ + unsigned long vaddr, vaddr_end, next_vaddr; + unsigned long dst_vaddr; + struct page **src_p, **dst_p; + struct kvm_sev_dbg debug; + unsigned long n; + unsigned int size; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + return -EFAULT; + + if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) + return -EINVAL; + if (!debug.dst_uaddr) + return -EINVAL; + + vaddr = debug.src_uaddr; + size = debug.len; + vaddr_end = vaddr + size; + dst_vaddr = debug.dst_uaddr; + + for (; vaddr < vaddr_end; vaddr = next_vaddr) { + int len, s_off, d_off; + + /* lock userspace source and destination page */ + src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); + if (!src_p) + return -EFAULT; + + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + if (!dst_p) { + sev_unpin_memory(kvm, src_p, n); + return -EFAULT; + } + + /* + * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(src_p, 1); + sev_clflush_pages(dst_p, 1); + + /* + * Since user buffer may not be page aligned, calculate the + * offset within the page. + */ + s_off = vaddr & ~PAGE_MASK; + d_off = dst_vaddr & ~PAGE_MASK; + len = min_t(size_t, (PAGE_SIZE - s_off), size); + + if (dec) + ret = __sev_dbg_decrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + dst_vaddr, + __sme_page_pa(dst_p[0]) + d_off, + len, &argp->error); + else + ret = __sev_dbg_encrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + vaddr, + __sme_page_pa(dst_p[0]) + d_off, + dst_vaddr, + len, &argp->error); + + sev_unpin_memory(kvm, src_p, n); + sev_unpin_memory(kvm, dst_p, n); + + if (ret) + goto err; + + next_vaddr = vaddr + len; + dst_vaddr = dst_vaddr + len; + size -= len; + } +err: + return ret; +} + +static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_launch_secret *data; + struct kvm_sev_launch_secret params; + struct page **pages; + void *blob, *hdr; + unsigned long n; + int ret, offset; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + if (!pages) + return -ENOMEM; + + /* + * The secret must be copied into contiguous memory region, lets verify + * that userspace memory pages are contiguous before we issue command. 
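sev_dbg_crypt() walks the range one source page at a time; note that the per-iteration length is clamped only by the source offset — a differently aligned destination still works because the user helpers above bounce through an intermediate page. The clamp itself, as a hypothetical helper (PAGE_SIZE/PAGE_MASK as in the earlier sketch):

/* One iteration's byte count: stop at the end of the current source
 * page or at the end of the request, whichever comes first. */
static unsigned long dbg_chunk_len(unsigned long vaddr, unsigned int size)
{
	unsigned long s_off = vaddr & ~PAGE_MASK;
	unsigned long len = PAGE_SIZE - s_off;

	return len < size ? len : size;
}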
+ */ + if (get_num_contig_pages(0, pages, n) != n) { + ret = -EINVAL; + goto e_unpin_memory; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + goto e_unpin_memory; + + offset = params.guest_uaddr & (PAGE_SIZE - 1); + data->guest_address = __sme_page_pa(pages[0]) + offset; + data->guest_len = params.guest_len; + + blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); + if (IS_ERR(blob)) { + ret = PTR_ERR(blob); + goto e_free; + } + + data->trans_address = __psp_pa(blob); + data->trans_len = params.trans_len; + + hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); + if (IS_ERR(hdr)) { + ret = PTR_ERR(hdr); + goto e_free_blob; + } + data->hdr_address = __psp_pa(hdr); + data->hdr_len = params.hdr_len; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); + + kfree(hdr); + +e_free_blob: + kfree(blob); +e_free: + kfree(data); +e_unpin_memory: + sev_unpin_memory(kvm, pages, n); + return ret; +} + +int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +{ + struct kvm_sev_cmd sev_cmd; + int r; + + if (!svm_sev_enabled()) + return -ENOTTY; + + if (!argp) + return 0; + + if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + switch (sev_cmd.id) { + case KVM_SEV_INIT: + r = sev_guest_init(kvm, &sev_cmd); + break; + case KVM_SEV_LAUNCH_START: + r = sev_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_LAUNCH_UPDATE_DATA: + r = sev_launch_update_data(kvm, &sev_cmd); + break; + case KVM_SEV_LAUNCH_MEASURE: + r = sev_launch_measure(kvm, &sev_cmd); + break; + case KVM_SEV_LAUNCH_FINISH: + r = sev_launch_finish(kvm, &sev_cmd); + break; + case KVM_SEV_GUEST_STATUS: + r = sev_guest_status(kvm, &sev_cmd); + break; + case KVM_SEV_DBG_DECRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, true); + break; + case KVM_SEV_DBG_ENCRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, false); + break; + case KVM_SEV_LAUNCH_SECRET: + r = sev_launch_secret(kvm, &sev_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + +int svm_register_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct enc_region *region; + int ret = 0; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (range->addr > ULONG_MAX || range->size > ULONG_MAX) + return -EINVAL; + + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); + if (!region) + return -ENOMEM; + + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + if (!region->pages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The guest may change the memory encryption attribute from C=0 -> C=1 + * or vice versa for this memory range. Lets make sure caches are + * flushed to ensure that guest data gets written into memory with + * correct C-bit. 
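Everything dispatched by svm_mem_enc_op() arrives through the KVM_MEMORY_ENCRYPT_OP vm ioctl. A minimal user-space sketch, assuming a uapi linux/kvm.h that provides struct kvm_sev_cmd and KVM_SEV_INIT; error handling omitted:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Kick off SEV for a VM: reaches sev_guest_init() above. */
static int sev_vm_init(int vm_fd)
{
	struct kvm_sev_cmd cmd = {
		.id = KVM_SEV_INIT,
		.sev_fd = open("/dev/sev", O_RDWR),	/* PSP device */
	};

	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}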
+ */ + sev_clflush_pages(region->pages, region->npages); + + region->uaddr = range->addr; + region->size = range->size; + + mutex_lock(&kvm->lock); + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + + return ret; + +e_free: + kfree(region); + return ret; +} + +static struct enc_region * +find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct list_head *head = &sev->regions_list; + struct enc_region *i; + + list_for_each_entry(i, head, list) { + if (i->uaddr == range->addr && + i->size == range->size) + return i; + } + + return NULL; +} + +static void __unregister_enc_region_locked(struct kvm *kvm, + struct enc_region *region) +{ + sev_unpin_memory(kvm, region->pages, region->npages); + list_del(®ion->list); + kfree(region); +} + +int svm_unregister_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct enc_region *region; + int ret; + + mutex_lock(&kvm->lock); + + if (!sev_guest(kvm)) { + ret = -ENOTTY; + goto failed; + } + + region = find_enc_region(kvm, range); + if (!region) { + ret = -EINVAL; + goto failed; + } + + /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + + __unregister_enc_region_locked(kvm, region); + + mutex_unlock(&kvm->lock); + return 0; + +failed: + mutex_unlock(&kvm->lock); + return ret; +} + +void sev_vm_destroy(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct list_head *head = &sev->regions_list; + struct list_head *pos, *q; + + if (!sev_guest(kvm)) + return; + + mutex_lock(&kvm->lock); + + /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + + /* + * if userspace was terminated before unregistering the memory regions + * then lets unpin all the registered memory. + */ + if (!list_empty(head)) { + list_for_each_safe(pos, q, head) { + __unregister_enc_region_locked(kvm, + list_entry(pos, struct enc_region, list)); + } + } + + mutex_unlock(&kvm->lock); + + sev_unbind_asid(kvm, sev->handle); + sev_asid_free(sev->asid); +} + +int __init sev_hardware_setup(void) +{ + struct sev_user_data_status *status; + int rc; + + /* Maximum number of encrypted guests supported simultaneously */ + max_sev_asid = cpuid_ecx(0x8000001F); + + if (!max_sev_asid) + return 1; + + /* Minimum ASID value that should be used for SEV guest */ + min_sev_asid = cpuid_edx(0x8000001F); + + /* Initialize SEV ASID bitmaps */ + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + if (!sev_asid_bitmap) + return 1; + + sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + if (!sev_reclaim_asid_bitmap) + return 1; + + status = kmalloc(sizeof(*status), GFP_KERNEL); + if (!status) + return 1; + + /* + * Check SEV platform status. + * + * PLATFORM_STATUS can be called in any state, if we failed to query + * the PLATFORM status then either PSP firmware does not support SEV + * feature or SEV firmware is dead. 
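The two CPUID reads in sev_hardware_setup() can be reproduced from user space to sanity-check a host: leaf 0x8000001F reports the maximum SEV ASID in ECX and the minimum usable SEV ASID in EDX. A small sketch using the GCC cpuid helper:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf not available */

	printf("max_sev_asid=%u min_sev_asid=%u\n", ecx, edx);
	return 0;
}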
+ */ + rc = sev_platform_status(status, NULL); + if (rc) + goto err; + + pr_info("SEV supported\n"); + +err: + kfree(status); + return rc; +} + +void sev_hardware_teardown(void) +{ + bitmap_free(sev_asid_bitmap); + bitmap_free(sev_reclaim_asid_bitmap); + + sev_flush_asids(); +} + +void pre_sev_run(struct vcpu_svm *svm, int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int asid = sev_get_asid(svm->vcpu.kvm); + + /* Assign the asid allocated with this SEV guest */ + svm->vmcb->control.asid = asid; + + /* + * Flush guest TLB: + * + * 1) when different VMCB for the same ASID is to be run on the same host CPU. + * 2) or this VMCB was executed on different host CPU in previous VMRUNs. + */ + if (sd->sev_vmcbs[asid] == svm->vmcb && + svm->last_cpu == cpu) + return; + + svm->last_cpu = cpu; + sd->sev_vmcbs[asid] = svm->vmcb; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; + mark_dirty(svm->vmcb, VMCB_ASID); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e32b4956fa06..05d77b395fc1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -196,47 +196,6 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_complete_interrupts(struct vcpu_svm *svm); -static int sev_flush_asids(void); -static DECLARE_RWSEM(sev_deactivate_lock); -static DEFINE_MUTEX(sev_bitmap_lock); -static unsigned int max_sev_asid; -static unsigned int min_sev_asid; -static unsigned long *sev_asid_bitmap; -static unsigned long *sev_reclaim_asid_bitmap; -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) - -struct enc_region { - struct list_head list; - unsigned long npages; - struct page **pages; - unsigned long uaddr; - unsigned long size; -}; - - -static inline bool svm_sev_enabled(void) -{ - return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0; -} - -static inline bool sev_guest(struct kvm *kvm) -{ -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif -} - -static inline int sev_get_asid(struct kvm *kvm) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; -} - static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -248,23 +207,7 @@ struct kvm_ldttss_desc { u32 zero1; } __attribute__((packed)); -struct svm_cpu_data { - int cpu; - - u64 asid_generation; - u32 max_asid; - u32 next_asid; - u32 min_asid; - struct kvm_ldttss_desc *tss_desc; - - struct page *save_area; - struct vmcb *current_vmcb; - - /* index = sev_asid, value = vmcb pointer */ - struct vmcb **sev_vmcbs; -}; - -static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; @@ -763,51 +706,6 @@ void disable_nmi_singlestep(struct vcpu_svm *svm) } } -static __init int sev_hardware_setup(void) -{ - struct sev_user_data_status *status; - int rc; - - /* Maximum number of encrypted guests supported simultaneously */ - max_sev_asid = cpuid_ecx(0x8000001F); - - if (!max_sev_asid) - return 1; - - /* Minimum ASID value that should be used for SEV guest */ - min_sev_asid = cpuid_edx(0x8000001F); - - /* Initialize SEV ASID bitmaps */ - sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); - if (!sev_asid_bitmap) - return 1; - - sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); - if (!sev_reclaim_asid_bitmap) - return 1; - - status = kmalloc(sizeof(*status), GFP_KERNEL); - if (!status) - return 1; - - /* - * Check SEV platform status. 
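pre_sev_run()'s flush rule compresses to a single predicate: skip the ASID flush only when this very VMCB was the last one to run with this ASID on this CPU. As a hypothetical helper:

static bool sev_needs_asid_flush(struct svm_cpu_data *sd,
				 struct vcpu_svm *svm, int asid, int cpu)
{
	return sd->sev_vmcbs[asid] != svm->vmcb || svm->last_cpu != cpu;
}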
- * - * PLATFORM_STATUS can be called in any state, if we failed to query - * the PLATFORM status then either PSP firmware does not support SEV - * feature or SEV firmware is dead. - */ - rc = sev_platform_status(status, NULL); - if (rc) - goto err; - - pr_info("SEV supported\n"); - -err: - kfree(status); - return rc; -} - static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -889,12 +787,8 @@ static void svm_hardware_teardown(void) { int cpu; - if (svm_sev_enabled()) { - bitmap_free(sev_asid_bitmap); - bitmap_free(sev_reclaim_asid_bitmap); - - sev_flush_asids(); - } + if (svm_sev_enabled()) + sev_hardware_teardown(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1250,199 +1144,6 @@ static void init_vmcb(struct vcpu_svm *svm) } -static void sev_asid_free(int asid) -{ - struct svm_cpu_data *sd; - int cpu, pos; - - mutex_lock(&sev_bitmap_lock); - - pos = asid - 1; - __set_bit(pos, sev_reclaim_asid_bitmap); - - for_each_possible_cpu(cpu) { - sd = per_cpu(svm_data, cpu); - sd->sev_vmcbs[pos] = NULL; - } - - mutex_unlock(&sev_bitmap_lock); -} - -static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) -{ - struct sev_data_decommission *decommission; - struct sev_data_deactivate *data; - - if (!handle) - return; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return; - - /* deactivate handle */ - data->handle = handle; - - /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ - down_read(&sev_deactivate_lock); - sev_guest_deactivate(data, NULL); - up_read(&sev_deactivate_lock); - - kfree(data); - - decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); - if (!decommission) - return; - - /* decommission handle */ - decommission->handle = handle; - sev_guest_decommission(decommission, NULL); - - kfree(decommission); -} - -static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, - unsigned long ulen, unsigned long *n, - int write) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unsigned long npages, npinned, size; - unsigned long locked, lock_limit; - struct page **pages; - unsigned long first, last; - - if (ulen == 0 || uaddr + ulen < uaddr) - return NULL; - - /* Calculate number of pages. */ - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT; - npages = (last - first + 1); - - locked = sev->pages_locked + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); - return NULL; - } - - /* Avoid using vmalloc for smaller buffers. */ - size = npages * sizeof(struct page *); - if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - else - pages = kmalloc(size, GFP_KERNEL_ACCOUNT); - - if (!pages) - return NULL; - - /* Pin the user virtual address. 
*/ - npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); - if (npinned != npages) { - pr_err("SEV: Failure locking %lu pages.\n", npages); - goto err; - } - - *n = npages; - sev->pages_locked = locked; - - return pages; - -err: - if (npinned > 0) - release_pages(pages, npinned); - - kvfree(pages); - return NULL; -} - -static void sev_unpin_memory(struct kvm *kvm, struct page **pages, - unsigned long npages) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - release_pages(pages, npages); - kvfree(pages); - sev->pages_locked -= npages; -} - -static void sev_clflush_pages(struct page *pages[], unsigned long npages) -{ - uint8_t *page_virtual; - unsigned long i; - - if (npages == 0 || pages == NULL) - return; - - for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); - clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); - } -} - -static void __unregister_enc_region_locked(struct kvm *kvm, - struct enc_region *region) -{ - sev_unpin_memory(kvm, region->pages, region->npages); - list_del(®ion->list); - kfree(region); -} - -static void sev_vm_destroy(struct kvm *kvm) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct list_head *head = &sev->regions_list; - struct list_head *pos, *q; - - if (!sev_guest(kvm)) - return; - - mutex_lock(&kvm->lock); - - /* - * Ensure that all guest tagged cache entries are flushed before - * releasing the pages back to the system for use. CLFLUSH will - * not do this, so issue a WBINVD. - */ - wbinvd_on_all_cpus(); - - /* - * if userspace was terminated before unregistering the memory regions - * then lets unpin all the registered memory. - */ - if (!list_empty(head)) { - list_for_each_safe(pos, q, head) { - __unregister_enc_region_locked(kvm, - list_entry(pos, struct enc_region, list)); - } - } - - mutex_unlock(&kvm->lock); - - sev_unbind_asid(kvm, sev->handle); - sev_asid_free(sev->asid); -} - -static void svm_vm_destroy(struct kvm *kvm) -{ - avic_vm_destroy(kvm); - sev_vm_destroy(kvm); -} - -static int svm_vm_init(struct kvm *kvm) -{ - if (avic) { - int ret = avic_vm_init(kvm); - if (ret) - return ret; - } - - kvm_apicv_init(kvm, avic); - return 0; -} - static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3292,30 +2993,6 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } -static void pre_sev_run(struct vcpu_svm *svm, int cpu) -{ - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); - - /* Assign the asid allocated with this SEV guest */ - svm->vmcb->control.asid = asid; - - /* - * Flush guest TLB: - * - * 1) when different VMCB for the same ASID is to be run on the same host CPU. - * 2) or this VMCB was executed on different host CPU in previous VMRUNs. - */ - if (sd->sev_vmcbs[asid] == svm->vmcb && - svm->last_cpu == cpu) - return; - - svm->last_cpu = cpu; - sd->sev_vmcbs[asid] = svm->vmcb; - svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - mark_dirty(svm->vmcb, VMCB_ASID); -} - static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); @@ -4216,900 +3893,6 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } -static int sev_flush_asids(void) -{ - int ret, error; - - /* - * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, - * so it must be guarded. 
- */ - down_write(&sev_deactivate_lock); - - wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); - - up_write(&sev_deactivate_lock); - - if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); - - return ret; -} - -/* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(void) -{ - int pos; - - /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, - max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) - return false; - - if (sev_flush_asids()) - return false; - - bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, - max_sev_asid); - bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); - - return true; -} - -static int sev_asid_new(void) -{ - bool retry = true; - int pos; - - mutex_lock(&sev_bitmap_lock); - - /* - * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. - */ -again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) { - if (retry && __sev_recycle_asids()) { - retry = false; - goto again; - } - mutex_unlock(&sev_bitmap_lock); - return -EBUSY; - } - - __set_bit(pos, sev_asid_bitmap); - - mutex_unlock(&sev_bitmap_lock); - - return pos + 1; -} - -static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - int asid, ret; - - ret = -EBUSY; - if (unlikely(sev->active)) - return ret; - - asid = sev_asid_new(); - if (asid < 0) - return ret; - - ret = sev_platform_init(&argp->error); - if (ret) - goto e_free; - - sev->active = true; - sev->asid = asid; - INIT_LIST_HEAD(&sev->regions_list); - - return 0; - -e_free: - sev_asid_free(asid); - return ret; -} - -static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) -{ - struct sev_data_activate *data; - int asid = sev_get_asid(kvm); - int ret; - - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; - - /* activate ASID on the given handle */ - data->handle = handle; - data->asid = asid; - ret = sev_guest_activate(data, error); - kfree(data); - - return ret; -} - -static int __sev_issue_cmd(int fd, int id, void *data, int *error) -{ - struct fd f; - int ret; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - ret = sev_issue_cmd_external_user(f.file, id, data, error); - - fdput(f); - return ret; -} - -static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return __sev_issue_cmd(sev->fd, id, data, error); -} - -static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_start *start; - struct kvm_sev_launch_start params; - void *dh_blob, *session_blob; - int *error = &argp->error; - int ret; - - if (!sev_guest(kvm)) - return -ENOTTY; - - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) - return -EFAULT; - - start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); - if (!start) - return -ENOMEM; - - dh_blob = NULL; - if (params.dh_uaddr) { - dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); - if (IS_ERR(dh_blob)) { - ret = PTR_ERR(dh_blob); - goto e_free; - } - - start->dh_cert_address = __sme_set(__pa(dh_blob)); - start->dh_cert_len = params.dh_len; - } - - session_blob = NULL; - if (params.session_uaddr) { - session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); - if 
-			ret = PTR_ERR(session_blob);
-			goto e_free_dh;
-		}
-
-		start->session_address = __sme_set(__pa(session_blob));
-		start->session_len = params.session_len;
-	}
-
-	start->handle = params.handle;
-	start->policy = params.policy;
-
-	/* create memory encryption context */
-	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
-	if (ret)
-		goto e_free_session;
-
-	/* Bind ASID to this guest */
-	ret = sev_bind_asid(kvm, start->handle, error);
-	if (ret)
-		goto e_free_session;
-
-	/* return handle to userspace */
-	params.handle = start->handle;
-	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
-		sev_unbind_asid(kvm, start->handle);
-		ret = -EFAULT;
-		goto e_free_session;
-	}
-
-	sev->handle = start->handle;
-	sev->fd = argp->sev_fd;
-
-e_free_session:
-	kfree(session_blob);
-e_free_dh:
-	kfree(dh_blob);
-e_free:
-	kfree(start);
-	return ret;
-}
-
-static unsigned long get_num_contig_pages(unsigned long idx,
-				struct page **inpages, unsigned long npages)
-{
-	unsigned long paddr, next_paddr;
-	unsigned long i = idx + 1, pages = 1;
-
-	/* find the number of contiguous pages starting from idx */
-	paddr = __sme_page_pa(inpages[idx]);
-	while (i < npages) {
-		next_paddr = __sme_page_pa(inpages[i++]);
-		if ((paddr + PAGE_SIZE) == next_paddr) {
-			pages++;
-			paddr = next_paddr;
-			continue;
-		}
-		break;
-	}
-
-	return pages;
-}
-
-static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-	unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct kvm_sev_launch_update_data params;
-	struct sev_data_launch_update_data *data;
-	struct page **inpages;
-	int ret;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
-		return -EFAULT;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		return -ENOMEM;
-
-	vaddr = params.uaddr;
-	size = params.len;
-	vaddr_end = vaddr + size;
-
-	/* Lock the user memory. */
-	inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
-	if (!inpages) {
-		ret = -ENOMEM;
-		goto e_free;
-	}
-
-	/*
-	 * The LAUNCH_UPDATE command will perform in-place encryption of the
-	 * memory content (i.e., it will write the same memory region with C=1).
-	 * It's possible that the cache may contain the data with C=0, i.e.,
-	 * unencrypted, so invalidate it first.
-	 */
-	sev_clflush_pages(inpages, npages);
-
-	for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
-		int offset, len;
-
-		/*
-		 * If the user buffer is not page-aligned, calculate the offset
-		 * within the page.
-		 */
-		offset = vaddr & (PAGE_SIZE - 1);
-
-		/* Calculate the number of pages that can be encrypted in one go. */
-		pages = get_num_contig_pages(i, inpages, npages);
-
-		len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
-
-		data->handle = sev->handle;
-		data->len = len;
-		data->address = __sme_page_pa(inpages[i]) + offset;
-		ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
-		if (ret)
-			goto e_unpin;
-
-		size -= len;
-		next_vaddr = vaddr + len;
-	}
-
-e_unpin:
-	/* content of memory is updated, mark pages dirty */
-	for (i = 0; i < npages; i++) {
-		set_page_dirty_lock(inpages[i]);
-		mark_page_accessed(inpages[i]);
-	}
-	/* unlock the user pages */
-	sev_unpin_memory(kvm, inpages, npages);
-e_free:
-	kfree(data);
-	return ret;
-}
-
-static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-	void __user *measure = (void __user *)(uintptr_t)argp->data;
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct sev_data_launch_measure *data;
-	struct kvm_sev_launch_measure params;
-	void __user *p = NULL;
-	void *blob = NULL;
-	int ret;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	if (copy_from_user(&params, measure, sizeof(params)))
-		return -EFAULT;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		return -ENOMEM;
-
-	/* User wants to query the blob length */
-	if (!params.len)
-		goto cmd;
-
-	p = (void __user *)(uintptr_t)params.uaddr;
-	if (p) {
-		if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-			ret = -EINVAL;
-			goto e_free;
-		}
-
-		ret = -ENOMEM;
-		blob = kmalloc(params.len, GFP_KERNEL);
-		if (!blob)
-			goto e_free;
-
-		data->address = __psp_pa(blob);
-		data->len = params.len;
-	}
-
-cmd:
-	data->handle = sev->handle;
-	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
-
-	/*
-	 * If we query the session length, FW responded with expected data.
-	 */
-	if (!params.len)
-		goto done;
-
-	if (ret)
-		goto e_free_blob;
-
-	if (blob) {
-		if (copy_to_user(p, blob, params.len))
-			ret = -EFAULT;
-	}
-
-done:
-	params.len = data->len;
-	if (copy_to_user(measure, &params, sizeof(params)))
-		ret = -EFAULT;
-e_free_blob:
-	kfree(blob);
-e_free:
-	kfree(data);
-	return ret;
-}
-
-static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct sev_data_launch_finish *data;
-	int ret;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		return -ENOMEM;
-
-	data->handle = sev->handle;
-	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
-
-	kfree(data);
-	return ret;
-}
-
-static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct kvm_sev_guest_status params;
-	struct sev_data_guest_status *data;
-	int ret;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		return -ENOMEM;
-
-	data->handle = sev->handle;
-	ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
-	if (ret)
-		goto e_free;
-
-	params.policy = data->policy;
-	params.state = data->state;
-	params.handle = data->handle;
-
-	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
-		ret = -EFAULT;
-e_free:
-	kfree(data);
-	return ret;
-}
-
-static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
-			       unsigned long dst, int size,
-			       int *error, bool enc)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct sev_data_dbg *data;
-	int ret;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		return -ENOMEM;
-
-	data->handle = sev->handle;
-	data->dst_addr = dst;
-	data->src_addr = src;
-	data->len = size;
-
-	ret = sev_issue_cmd(kvm,
-			    enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
-			    data, error);
-	kfree(data);
-	return ret;
-}
-
-static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
-			     unsigned long dst_paddr, int sz, int *err)
-{
-	int offset;
-
-	/*
-	 * It's safe to read more than we are asked; the caller should ensure
-	 * that the destination has enough space.
-	 */
-	src_paddr = round_down(src_paddr, 16);
-	offset = src_paddr & 15;
-	sz = round_up(sz + offset, 16);
-
-	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
-}
-
-static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
-				  unsigned long __user dst_uaddr,
-				  unsigned long dst_paddr,
-				  int size, int *err)
-{
-	struct page *tpage = NULL;
-	int ret, offset;
-
-	/* if inputs are not 16-byte aligned then use an intermediate buffer */
-	if (!IS_ALIGNED(dst_paddr, 16) ||
-	    !IS_ALIGNED(paddr, 16) ||
-	    !IS_ALIGNED(size, 16)) {
-		tpage = (void *)alloc_page(GFP_KERNEL);
-		if (!tpage)
-			return -ENOMEM;
-
-		dst_paddr = __sme_page_pa(tpage);
-	}
-
-	ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
-	if (ret)
-		goto e_free;
-
-	if (tpage) {
-		offset = paddr & 15;
-		if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
-				 page_address(tpage) + offset, size))
-			ret = -EFAULT;
-	}
-
-e_free:
-	if (tpage)
-		__free_page(tpage);
-
-	return ret;
-}
-
-static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
-				  unsigned long __user vaddr,
-				  unsigned long dst_paddr,
-				  unsigned long __user dst_vaddr,
-				  int size, int *error)
-{
-	struct page *src_tpage = NULL;
-	struct page *dst_tpage = NULL;
-	int ret, len = size;
-
-	/* If the source buffer is not aligned then use an intermediate buffer */
-	if (!IS_ALIGNED(vaddr, 16)) {
-		src_tpage = alloc_page(GFP_KERNEL);
-		if (!src_tpage)
-			return -ENOMEM;
-
-		if (copy_from_user(page_address(src_tpage),
-				   (void __user *)(uintptr_t)vaddr, size)) {
-			__free_page(src_tpage);
-			return -EFAULT;
-		}
-
-		paddr = __sme_page_pa(src_tpage);
-	}
-
-	/*
-	 * If the destination buffer or length is not aligned then do a
-	 * read-modify-write:
-	 *   - decrypt the destination into an intermediate buffer
-	 *   - copy the source buffer into an intermediate buffer
-	 *   - use the intermediate buffer as the source buffer
-	 */
-	if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
-		int dst_offset;
-
-		dst_tpage = alloc_page(GFP_KERNEL);
-		if (!dst_tpage) {
-			ret = -ENOMEM;
-			goto e_free;
-		}
-
-		ret = __sev_dbg_decrypt(kvm, dst_paddr,
-					__sme_page_pa(dst_tpage), size, error);
-		if (ret)
-			goto e_free;
-
-		/*
-		 * If the source is a kernel buffer then use memcpy(), otherwise
-		 * copy_from_user().
-		 */
-		dst_offset = dst_paddr & 15;
-
-		if (src_tpage)
-			memcpy(page_address(dst_tpage) + dst_offset,
-			       page_address(src_tpage), size);
-		else {
-			if (copy_from_user(page_address(dst_tpage) + dst_offset,
-					   (void __user *)(uintptr_t)vaddr, size)) {
-				ret = -EFAULT;
-				goto e_free;
-			}
-		}
-
-		paddr = __sme_page_pa(dst_tpage);
-		dst_paddr = round_down(dst_paddr, 16);
-		len = round_up(size, 16);
-	}
-
-	ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
-
-e_free:
-	if (src_tpage)
-		__free_page(src_tpage);
-	if (dst_tpage)
-		__free_page(dst_tpage);
-	return ret;
-}
-
-static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
-{
-	unsigned long vaddr, vaddr_end, next_vaddr;
-	unsigned long dst_vaddr;
-	struct page **src_p, **dst_p;
-	struct kvm_sev_dbg debug;
-	unsigned long n;
-	unsigned int size;
-	int ret;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
-		return -EFAULT;
-
-	if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
-		return -EINVAL;
-	if (!debug.dst_uaddr)
-		return -EINVAL;
-
-	vaddr = debug.src_uaddr;
-	size = debug.len;
-	vaddr_end = vaddr + size;
-	dst_vaddr = debug.dst_uaddr;
-
-	for (; vaddr < vaddr_end; vaddr = next_vaddr) {
-		int len, s_off, d_off;
-
-		/* lock userspace source and destination page */
-		src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
-		if (!src_p)
-			return -EFAULT;
-
-		dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
-		if (!dst_p) {
-			sev_unpin_memory(kvm, src_p, n);
-			return -EFAULT;
-		}
-
-		/*
-		 * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
-		 * memory content (i.e., it will write the same memory region with C=1).
-		 * It's possible that the cache may contain the data with C=0, i.e.,
-		 * unencrypted, so invalidate it first.
-		 */
-		sev_clflush_pages(src_p, 1);
-		sev_clflush_pages(dst_p, 1);
-
-		/*
-		 * Since the user buffer may not be page-aligned, calculate the
-		 * offset within the page.
-		 */
-		s_off = vaddr & ~PAGE_MASK;
-		d_off = dst_vaddr & ~PAGE_MASK;
-		len = min_t(size_t, (PAGE_SIZE - s_off), size);
-
-		if (dec)
-			ret = __sev_dbg_decrypt_user(kvm,
-						     __sme_page_pa(src_p[0]) + s_off,
-						     dst_vaddr,
-						     __sme_page_pa(dst_p[0]) + d_off,
-						     len, &argp->error);
-		else
-			ret = __sev_dbg_encrypt_user(kvm,
						     __sme_page_pa(src_p[0]) + s_off,
-						     vaddr,
-						     __sme_page_pa(dst_p[0]) + d_off,
-						     dst_vaddr,
-						     len, &argp->error);
-
-		sev_unpin_memory(kvm, src_p, n);
-		sev_unpin_memory(kvm, dst_p, n);
-
-		if (ret)
-			goto err;
-
-		next_vaddr = vaddr + len;
-		dst_vaddr = dst_vaddr + len;
-		size -= len;
-	}
-err:
-	return ret;
-}
-
-static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct sev_data_launch_secret *data;
-	struct kvm_sev_launch_secret params;
-	struct page **pages;
-	void *blob, *hdr;
-	unsigned long n;
-	int ret, offset;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
-		return -EFAULT;
-
-	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
-	if (!pages)
-		return -ENOMEM;
-
-	/*
-	 * The secret must be copied into a contiguous memory region; let's
-	 * verify that the userspace memory pages are contiguous before we
-	 * issue the command.
-	 */
-	if (get_num_contig_pages(0, pages, n) != n) {
-		ret = -EINVAL;
-		goto e_unpin_memory;
-	}
-
-	ret = -ENOMEM;
-	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-	if (!data)
-		goto e_unpin_memory;
-
-	offset = params.guest_uaddr & (PAGE_SIZE - 1);
-	data->guest_address = __sme_page_pa(pages[0]) + offset;
-	data->guest_len = params.guest_len;
-
-	blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
-	if (IS_ERR(blob)) {
-		ret = PTR_ERR(blob);
-		goto e_free;
-	}
-
-	data->trans_address = __psp_pa(blob);
-	data->trans_len = params.trans_len;
-
-	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
-	if (IS_ERR(hdr)) {
-		ret = PTR_ERR(hdr);
-		goto e_free_blob;
-	}
-	data->hdr_address = __psp_pa(hdr);
-	data->hdr_len = params.hdr_len;
-
-	data->handle = sev->handle;
-	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
-
-	kfree(hdr);
-
-e_free_blob:
-	kfree(blob);
-e_free:
-	kfree(data);
-e_unpin_memory:
-	sev_unpin_memory(kvm, pages, n);
-	return ret;
-}
-
-static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
-{
-	struct kvm_sev_cmd sev_cmd;
-	int r;
-
-	if (!svm_sev_enabled())
-		return -ENOTTY;
-
-	if (!argp)
-		return 0;
-
-	if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
-		return -EFAULT;
-
-	mutex_lock(&kvm->lock);
-
-	switch (sev_cmd.id) {
-	case KVM_SEV_INIT:
-		r = sev_guest_init(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_LAUNCH_START:
-		r = sev_launch_start(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_LAUNCH_UPDATE_DATA:
-		r = sev_launch_update_data(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_LAUNCH_MEASURE:
-		r = sev_launch_measure(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_LAUNCH_FINISH:
-		r = sev_launch_finish(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_GUEST_STATUS:
-		r = sev_guest_status(kvm, &sev_cmd);
-		break;
-	case KVM_SEV_DBG_DECRYPT:
-		r = sev_dbg_crypt(kvm, &sev_cmd, true);
-		break;
-	case KVM_SEV_DBG_ENCRYPT:
-		r = sev_dbg_crypt(kvm, &sev_cmd, false);
-		break;
-	case KVM_SEV_LAUNCH_SECRET:
-		r = sev_launch_secret(kvm, &sev_cmd);
-		break;
-	default:
-		r = -EINVAL;
-		goto out;
-	}
-
-	if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
-		r = -EFAULT;
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int svm_register_enc_region(struct kvm *kvm,
-				   struct kvm_enc_region *range)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct enc_region *region;
-	int ret = 0;
-
-	if (!sev_guest(kvm))
-		return -ENOTTY;
-
-	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
-		return -EINVAL;
-
-	region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
-	if (!region)
-		return -ENOMEM;
-
-	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
-	if (!region->pages) {
-		ret = -ENOMEM;
-		goto e_free;
-	}
-
-	/*
-	 * The guest may change the memory encryption attribute from C=0 -> C=1
-	 * or vice versa for this memory range. Let's make sure caches are
-	 * flushed to ensure that guest data gets written into memory with
-	 * the correct C-bit.
-	 */
-	sev_clflush_pages(region->pages, region->npages);
-
-	region->uaddr = range->addr;
-	region->size = range->size;
-
-	mutex_lock(&kvm->lock);
-	list_add_tail(&region->list, &sev->regions_list);
-	mutex_unlock(&kvm->lock);
-
-	return ret;
-
-e_free:
-	kfree(region);
-	return ret;
-}
-
-static struct enc_region *
-find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
-{
-	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-	struct list_head *head = &sev->regions_list;
-	struct enc_region *i;
-
-	list_for_each_entry(i, head, list) {
-		if (i->uaddr == range->addr &&
-		    i->size == range->size)
-			return i;
-	}
-
-	return NULL;
-}
-
-
-static int svm_unregister_enc_region(struct kvm *kvm,
-				     struct kvm_enc_region *range)
-{
-	struct enc_region *region;
-	int ret;
-
-	mutex_lock(&kvm->lock);
-
-	if (!sev_guest(kvm)) {
-		ret = -ENOTTY;
-		goto failed;
-	}
-
-	region = find_enc_region(kvm, range);
-	if (!region) {
-		ret = -EINVAL;
-		goto failed;
-	}
-
-	/*
-	 * Ensure that all guest tagged cache entries are flushed before
-	 * releasing the pages back to the system for use. CLFLUSH will
-	 * not do this, so issue a WBINVD.
-	 */
-	wbinvd_on_all_cpus();
-
-	__unregister_enc_region_locked(kvm, region);
-
-	mutex_unlock(&kvm->lock);
-	return 0;
-
-failed:
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 {
 	unsigned long cr4 = kvm_read_cr4(vcpu);
@@ -5183,6 +3966,24 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 		(svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
 }
 
+static void svm_vm_destroy(struct kvm *kvm)
+{
+	avic_vm_destroy(kvm);
+	sev_vm_destroy(kvm);
+}
+
+static int svm_vm_init(struct kvm *kvm)
+{
+	if (avic) {
+		int ret = avic_vm_init(kvm);
+		if (ret)
+			return ret;
+	}
+
+	kvm_apicv_init(kvm, avic);
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.hardware_unsetup = svm_hardware_teardown,
 	.hardware_enable = svm_hardware_enable,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index c7abc1fede97..df3474f4fb02 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -171,6 +171,24 @@ struct vcpu_svm {
 	unsigned int last_cpu;
 };
 
+struct svm_cpu_data {
+	int cpu;
+
+	u64 asid_generation;
+	u32 max_asid;
+	u32 next_asid;
+	u32 min_asid;
+	struct kvm_ldttss_desc *tss_desc;
+
+	struct page *save_area;
+	struct vmcb *current_vmcb;
+
+	/* index = sev_asid, value = vmcb pointer */
+	struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
+
 void recalc_intercepts(struct vcpu_svm *svm);
 
 static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
@@ -440,4 +458,34 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
 void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
 
+/* sev.c */
+
+extern unsigned int max_sev_asid;
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+	return sev->active;
+#else
+	return false;
+#endif
+}
+
+static inline bool svm_sev_enabled(void)
+{
+	return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
+}
+
+void sev_vm_destroy(struct kvm *kvm);
+int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
+int svm_register_enc_region(struct kvm *kvm,
+			    struct kvm_enc_region *range);
+int svm_unregister_enc_region(struct kvm *kvm,
+			      struct kvm_enc_region *range);
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+int __init sev_hardware_setup(void);
+void sev_hardware_teardown(void);
+
 #endif
-- cgit v1.2.3
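Before moving on to the next patch, it is worth pausing on the ASID handling that has just been moved into sev.c: sev_asid_new() scans a bitmap for a free ASID and, only when the range is exhausted, falls back to __sev_recycle_asids(), which flushes caches and reclaims deactivated ASIDs before retrying exactly once. The stand-alone C sketch below mirrors that control flow under simplified assumptions; every name in it (id_new, recycle_ids, flush_ids, MIN_ID, MAX_ID) is a hypothetical stand-in, and the real kernel code additionally serializes on sev_bitmap_lock and uses the bitmap helpers shown above.

	/* Illustrative sketch only, not kernel code. */
	#include <stdbool.h>
	#include <stdio.h>

	#define MIN_ID 1
	#define MAX_ID 64

	static bool in_use[MAX_ID + 1];		/* stands in for sev_asid_bitmap */
	static bool reclaim[MAX_ID + 1];	/* stands in for sev_reclaim_asid_bitmap */

	/* Stand-in for sev_flush_asids(); returns 0 on success. */
	static int flush_ids(void)
	{
		return 0;
	}

	/* Stand-in for __sev_recycle_asids(): flush, then move reclaimable IDs back. */
	static bool recycle_ids(void)
	{
		bool any = false;
		int i;

		for (i = MIN_ID; i <= MAX_ID; i++)
			any |= reclaim[i];

		if (!any || flush_ids())
			return false;

		for (i = MIN_ID; i <= MAX_ID; i++) {
			if (reclaim[i]) {
				in_use[i] = false;
				reclaim[i] = false;
			}
		}
		return true;
	}

	/* Stand-in for sev_asid_new(): allocate the lowest free ID, recycling once. */
	static int id_new(void)
	{
		bool retry = true;
		int i;

	again:
		for (i = MIN_ID; i <= MAX_ID; i++) {
			if (!in_use[i]) {
				in_use[i] = true;
				return i;
			}
		}
		if (retry && recycle_ids()) {
			retry = false;
			goto again;
		}
		return -1;	/* the kernel version returns -EBUSY */
	}

	int main(void)
	{
		printf("first id: %d\n", id_new());	/* prints "first id: 1" */
		return 0;
	}

Retrying at most once keeps the expensive WBINVD plus DF_FLUSH path out of the common allocation case while still letting a busy host reuse ASIDs freed by destroyed guests.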
From 199cd1d7b5348de4b58208420687676c658efed3 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Mon, 30 Mar 2020 15:02:13 +0200
Subject: KVM: SVM: Split svm_vcpu_run inline assembly to separate file

The compiler (GCC) does not cope well with an inline assembly block
that clobbers all available machine registers in the middle of a
function. This situation can be found in function svm_vcpu_run in file
kvm/svm.c and results in many register spills and fills to/from the
stack frame.

This patch fixes the issue with the same approach as was done for VMX
some time ago. The big inline assembly is moved to a separate assembly
.S file, taking into account all ABI requirements.

There are two main benefits of the above approach:

* elimination of several register spills and fills to/from the stack
frame, and consequently a smaller function .text size. The binary size
of svm_vcpu_run is lowered from 2019 to 1626 bytes.

* more efficient access to a register save array. Currently, the
register save array is accessed as:

    7b00:	48 8b 98 28 02 00 00 	mov    0x228(%rax),%rbx
    7b07:	48 8b 88 18 02 00 00 	mov    0x218(%rax),%rcx
    7b0e:	48 8b 90 20 02 00 00 	mov    0x220(%rax),%rdx

and passing a pointer to the register array as an argument to a
function one gets:

    12:	48 8b 48 08          	mov    0x8(%rax),%rcx
    16:	48 8b 50 10          	mov    0x10(%rax),%rdx
    1a:	48 8b 58 18          	mov    0x18(%rax),%rbx

As a result, the total size, considering that the new function size is
229 bytes, gets lowered by 164 bytes.

Signed-off-by: Uros Bizjak
Signed-off-by: Paolo Bonzini
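To see why the clobber-heavy asm statement is expensive, consider the following minimal sketch (x86-64, GCC); it is illustrative only, and the function names are invented. Any value that is live across the asm block, such as 'sum' below, must be spilled to the stack or squeezed into one of the few registers the clobber list leaves untouched; with the kernel's full clobber list, spills are unavoidable.

	/* Illustrative only, not kernel code. Compile with gcc -O2 on x86-64. */
	static inline void clobber_everything(void)
	{
		/* Roughly the effect of the old vmrun asm block's clobber list. */
		asm volatile("" : : :
			     "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
			     "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
			     "cc", "memory");
	}

	int sum_then_clobber(const int *v, int n)
	{
		int sum = 0;
		int i;

		for (i = 0; i < n; i++)
			sum += v[i];

		/* 'sum' is live here, so it must survive the clobbering block. */
		clobber_everything();

		return sum;
	}

Hiding the clobbers behind a regular function call, as this patch does, lets the compiler rely on the standard calling convention: callee-saved registers are preserved once in the callee's prologue instead of the caller spilling everything around the asm statement.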
---
 arch/x86/kvm/Makefile      | 2 +-
 arch/x86/kvm/svm/svm.c     | 92 +-------------------------
 arch/x86/kvm/svm/vmenter.S | 162 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 90 deletions(-)
 create mode 100644 arch/x86/kvm/svm/vmenter.S
(limited to 'arch/x86/kvm/Makefile')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a71aa0967b..a789759b7261 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
 hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
 
 kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
-kvm-amd-y += svm/svm.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 05d77b395fc1..2be5bbae3a40 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3276,6 +3276,8 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
 	svm_complete_interrupts(svm);
 }
 
+bool __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3330,95 +3332,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	local_irq_enable();
 
-	asm volatile (
-		"push %%" _ASM_BP "; \n\t"
-		"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
-		"mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
-		"mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
-		"mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
-		"mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
-		"mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
-		"mov %c[r8](%[svm]), %%r8 \n\t"
-		"mov %c[r9](%[svm]), %%r9 \n\t"
-		"mov %c[r10](%[svm]), %%r10 \n\t"
-		"mov %c[r11](%[svm]), %%r11 \n\t"
-		"mov %c[r12](%[svm]), %%r12 \n\t"
-		"mov %c[r13](%[svm]), %%r13 \n\t"
-		"mov %c[r14](%[svm]), %%r14 \n\t"
-		"mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
-		/* Enter guest mode */
-		"push %%" _ASM_AX " \n\t"
-		"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
-		__ex("vmload %%" _ASM_AX) "\n\t"
-		__ex("vmrun %%" _ASM_AX) "\n\t"
-		__ex("vmsave %%" _ASM_AX) "\n\t"
-		"pop %%" _ASM_AX " \n\t"
-
-		/* Save guest registers, load host registers */
-		"mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
-		"mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
-		"mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
-		"mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
-		"mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
-		"mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
-		"mov %%r8, %c[r8](%[svm]) \n\t"
-		"mov %%r9, %c[r9](%[svm]) \n\t"
-		"mov %%r10, %c[r10](%[svm]) \n\t"
-		"mov %%r11, %c[r11](%[svm]) \n\t"
-		"mov %%r12, %c[r12](%[svm]) \n\t"
-		"mov %%r13, %c[r13](%[svm]) \n\t"
-		"mov %%r14, %c[r14](%[svm]) \n\t"
-		"mov %%r15, %c[r15](%[svm]) \n\t"
-		/*
-		 * Clear host registers marked as clobbered to prevent
-		 * speculative use.
- */ - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "xor %%ebx, %%ebx \n\t" - "xor %%ecx, %%ecx \n\t" - "xor %%edx, %%edx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "pop %%" _ASM_BP - : - : [svm]"a"(svm), - [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) -#ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx", "esi", "edi" -#endif - ); + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S new file mode 100644 index 000000000000..fa1af90067e9 --- /dev/null +++ b/arch/x86/kvm/svm/vmenter.S @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#define WORD_SIZE (BITS_PER_LONG / 8) + +/* Intentionally omit RAX as it's context switched by hardware */ +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif + + .text + +/** + * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode + * @vmcb_pa: unsigned long + * @regs: unsigned long * (to guest registers) + */ +SYM_FUNC_START(__svm_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX + + /* Save @regs. */ + push %_ASM_ARG2 + + /* Save @vmcb. */ + push %_ASM_ARG1 + + /* Move @regs to RAX. */ + mov %_ASM_ARG2, %_ASM_AX + + /* Load guest registers. 
+	mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+	mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+	mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+	mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+	mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+	mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+#ifdef CONFIG_X86_64
+	mov VCPU_R8 (%_ASM_AX), %r8
+	mov VCPU_R9 (%_ASM_AX), %r9
+	mov VCPU_R10(%_ASM_AX), %r10
+	mov VCPU_R11(%_ASM_AX), %r11
+	mov VCPU_R12(%_ASM_AX), %r12
+	mov VCPU_R13(%_ASM_AX), %r13
+	mov VCPU_R14(%_ASM_AX), %r14
+	mov VCPU_R15(%_ASM_AX), %r15
+#endif
+
+	/* "POP" @vmcb to RAX. */
+	pop %_ASM_AX
+
+	/* Enter guest mode */
+1:	vmload %_ASM_AX
+	jmp 3f
+2:	cmpb $0, kvm_rebooting
+	jne 3f
+	ud2
+	_ASM_EXTABLE(1b, 2b)
+
+3:	vmrun %_ASM_AX
+	jmp 5f
+4:	cmpb $0, kvm_rebooting
+	jne 5f
+	ud2
+	_ASM_EXTABLE(3b, 4b)
+
+5:	vmsave %_ASM_AX
+	jmp 7f
+6:	cmpb $0, kvm_rebooting
+	jne 7f
+	ud2
+	_ASM_EXTABLE(5b, 6b)
+7:
+	/* "POP" @regs to RAX. */
+	pop %_ASM_AX
+
+	/* Save all guest registers. */
+	mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+	mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+	mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+	mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+	mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+	mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+	mov %r8,  VCPU_R8 (%_ASM_AX)
+	mov %r9,  VCPU_R9 (%_ASM_AX)
+	mov %r10, VCPU_R10(%_ASM_AX)
+	mov %r11, VCPU_R11(%_ASM_AX)
+	mov %r12, VCPU_R12(%_ASM_AX)
+	mov %r13, VCPU_R13(%_ASM_AX)
+	mov %r14, VCPU_R14(%_ASM_AX)
+	mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+	/*
+	 * Clear all general purpose registers except RSP and RAX to prevent
+	 * speculative use of the guest's values, even those that are reloaded
+	 * via the stack. In theory, an L1 cache miss when restoring registers
+	 * could lead to speculative execution with the guest's values.
+	 * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+	 * free. RSP and RAX are exempt as they are restored by hardware
+	 * during VM-Exit.
+	 */
+	xor %ecx, %ecx
+	xor %edx, %edx
+	xor %ebx, %ebx
+	xor %ebp, %ebp
+	xor %esi, %esi
+	xor %edi, %edi
+#ifdef CONFIG_X86_64
+	xor %r8d,  %r8d
+	xor %r9d,  %r9d
+	xor %r10d, %r10d
+	xor %r11d, %r11d
+	xor %r12d, %r12d
+	xor %r13d, %r13d
+	xor %r14d, %r14d
+	xor %r15d, %r15d
+#endif
+
+	pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+	pop %r12
+	pop %r13
+	pop %r14
+	pop %r15
+#else
+	pop %esi
+	pop %edi
+#endif
+	pop %_ASM_BP
+	ret
+SYM_FUNC_END(__svm_vcpu_run)
-- cgit v1.2.3
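A final note on the calling convention introduced above: __svm_vcpu_run addresses the register save area as base plus __VCPU_REGS_<reg> * WORD_SIZE, which only works because the C side passes vcpu.arch.regs as a flat array of unsigned longs. The small user-space sketch below demonstrates that offset arithmetic; the enum is a hypothetical mirror of the kernel's register ordering, not the kernel's actual definition.

	/* Illustrative sketch only, not kernel code. */
	#include <stdio.h>

	/* Hypothetical mirror of the kernel's VCPU_REGS_* ordering. */
	enum { REGS_RAX, REGS_RCX, REGS_RDX, REGS_RBX,
	       REGS_RSP, REGS_RBP, REGS_RSI, REGS_RDI, NR_REGS };

	#define WORD_SIZE sizeof(unsigned long)

	int main(void)
	{
		unsigned long regs[NR_REGS] = { 0 };

		/* vmenter.S computes VCPU_RCX as __VCPU_REGS_RCX * WORD_SIZE... */
		printf("VCPU_RCX offset: %zu bytes\n", REGS_RCX * WORD_SIZE);

		/* ...which matches the position of regs[REGS_RCX] in the array. */
		printf("&regs[REGS_RCX] - &regs[0]: %zu bytes\n",
		       (size_t)((char *)&regs[REGS_RCX] - (char *)regs));
		return 0;
	}

The same layout contract explains why the assembly deliberately skips RAX and RSP: both are saved and restored by hardware via the VMCB, as the comments in vmenter.S note.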