author    | Paolo Bonzini <pbonzini@redhat.com> | 2022-12-06 12:29:06 -0500
committer | Paolo Bonzini <pbonzini@redhat.com> | 2022-12-12 15:54:07 -0500
commit    | 9352e7470a1b4edd2fa9d235420ecc7bc3971bdc (patch)
tree      | 141ccdb777f2ee36764f2067ab43f23da114e08a /arch/x86/kvm
parent    | 2afc1fbbdab2aee831561f09f859989dcd5ed648 (diff)
parent    | 5656374b168c98377b6feee8d7500993eebda230 (diff)
download  | linux-9352e7470a1b4edd2fa9d235420ecc7bc3971bdc.tar.bz2
Merge remote-tracking branch 'kvm/queue' into HEAD
x86 Xen-for-KVM:
* Allow the Xen runstate information to cross a page boundary (the
  split-copy arithmetic is sketched after this list)
* Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured
* Add support for 32-bit guests in SCHEDOP_poll
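A minimal user-space sketch of the split-copy arithmetic behind the
page-boundary support. The PAGE_SIZE/PAGE_MASK definitions, the helper name
and the example address are illustrative only; the length computation mirrors
the user_len1/user_len2 logic added to kvm_xen_update_runstate_guest() in the
xen.c diff below.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Split a guest write of 'len' bytes at guest physical address 'gpa' into
 * the chunk that fits in the first page (len1) and the remainder that
 * spills into the next page (len2).
 */
static void split_guest_write(uint64_t gpa, size_t len,
			      size_t *len1, size_t *len2)
{
	if ((gpa & ~PAGE_MASK) + len >= PAGE_SIZE) {
		*len1 = PAGE_SIZE - (gpa & ~PAGE_MASK);
		*len2 = len - *len1;
	} else {
		*len1 = len;
		*len2 = 0;
	}
}

int main(void)
{
	/* A 0x2c-byte compat runstate struct placed 8 bytes before a page end. */
	uint64_t gpa = 0x11ff8;
	size_t len = 0x2c, len1, len2;

	split_guest_write(gpa, len, &len1, &len2);
	printf("first page: %zu bytes, second page: %zu bytes\n", len1, len2);
	/* Each chunk is then written through its own mapping of the guest page. */
	return 0;
}

Nothing here is KVM API; it only illustrates why two separate gfn_to_pfn
caches (and two copies) are needed once the structure may straddle a page.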
x86 fixes:
* One-off fixes for various emulation flows (SGX, VMXON, NRIPS=0).
* Reinstate IBPB on emulated VM-Exit that was incorrectly dropped a few
years back when eliminating unnecessary barriers when switching between
vmcs01 and vmcs02.
* Clean up the MSR filter docs.
* Clean up vmread_error_trampoline() to make it more obvious that params
must be passed on the stack, even for x86-64.
* Let userspace set all supported bits in MSR_IA32_FEAT_CTL irrespective
of the current guest CPUID (the acceptance check is sketched after this list).
* Fudge around a race with TSC refinement that results in KVM incorrectly
thinking a guest needs TSC scaling when running on a CPU with a
constant TSC, but no hardware-enumerated TSC frequency.
* Advertise (on AMD) that the SMM_CTL MSR is not supported
* Remove unnecessary exports
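The MSR_IA32_FEAT_CTL item above distinguishes host-initiated writes from
guest writes. Below is a standalone sketch of that acceptance logic, assuming
simplified stand-ins for the bit definitions and vCPU state; the real
valid-bits bookkeeping is in the vmx.c diff further down.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the FEAT_CTL bits used in this sketch. */
#define FEAT_CTL_LOCKED                  (1ULL << 0)
#define FEAT_CTL_VMX_ENABLED_INSIDE_SMX  (1ULL << 1)
#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX (1ULL << 2)

#define SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
				   FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
				   FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX)

struct vcpu_state {
	uint64_t feature_control;		/* current MSR value */
	uint64_t feature_control_valid_bits;	/* bits the guest may set */
};

/*
 * Return true if the write should be accepted.  Host-initiated (userspace)
 * writes may set any supported bit regardless of guest CPUID; guest writes
 * are limited to the per-vCPU valid bits and refused once LOCKED is set.
 */
static bool feature_control_write_ok(const struct vcpu_state *v,
				     uint64_t data, bool host_initiated)
{
	uint64_t valid_bits;

	if (!host_initiated && (v->feature_control & FEAT_CTL_LOCKED))
		return false;

	valid_bits = host_initiated ? SUPPORTED_FEATURE_CONTROL
				    : v->feature_control_valid_bits;

	return !(data & ~valid_bits);
}

int main(void)
{
	struct vcpu_state v = {
		.feature_control = FEAT_CTL_LOCKED,
		.feature_control_valid_bits = FEAT_CTL_LOCKED,
	};

	/* Guest write after the MSR is locked: refused. */
	printf("guest write: %d\n",
	       feature_control_write_ok(&v, FEAT_CTL_LOCKED, false));
	/* Userspace restoring state with VMX enabled: accepted. */
	printf("host write:  %d\n",
	       feature_control_write_ok(&v,
			FEAT_CTL_LOCKED | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX,
			true));
	return 0;
}

Userspace restoring saved state can thus set any supported bit, while a guest
write stays bounded by its CPUID-derived valid bits and is blocked once the
lock bit is set.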
Selftests:
* Fix an inverted check in the access tracking perf test, and restore
support for asserting that there aren't too many idle pages when
running on bare metal.
* Fix an ordering issue in the AMX test introduced by recent conversions
to use kvm_cpu_has(), and harden the code to guard against similar bugs
in the future. Anything that triggers caching of KVM's supported CPUID,
kvm_cpu_has() in this case, effectively hides opt-in XSAVE features if
the caching occurs before the test opts in via prctl().
* Fix build errors that occur in certain setups (unsure exactly what is
unique about the problematic setup) due to glibc overriding
static_assert() to a variant that requires a custom message.
* Introduce actual atomics for clear/set_bit() in selftests (sketched below)
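Not the selftests' actual helpers, but an equivalent way to get genuinely
atomic set/clear bit operations using compiler __atomic built-ins; the bitmap
size and thread count below are arbitrary.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Atomically set/clear one bit in a bitmap built from unsigned longs. */
static inline void atomic_set_bit(unsigned long nr, unsigned long *addr)
{
	__atomic_fetch_or(&addr[nr / BITS_PER_LONG],
			  1UL << (nr % BITS_PER_LONG), __ATOMIC_SEQ_CST);
}

static inline void atomic_clear_bit(unsigned long nr, unsigned long *addr)
{
	__atomic_fetch_and(&addr[nr / BITS_PER_LONG],
			   ~(1UL << (nr % BITS_PER_LONG)), __ATOMIC_SEQ_CST);
}

static unsigned long bitmap[4];

static void *worker(void *arg)
{
	unsigned long base = (unsigned long)(uintptr_t)arg;

	/* Each thread owns every other bit; atomic RMWs keep them from racing. */
	for (unsigned long i = base; i < 4 * BITS_PER_LONG; i += 2)
		atomic_set_bit(i, bitmap);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (int i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, worker, (void *)(uintptr_t)i);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);

	for (int i = 0; i < 4; i++)
		printf("word %d: %#lx\n", i, bitmap[i]);
	return 0;
}

Compile with -pthread; every word prints as all-ones because the atomic RMWs
from the two threads cannot lose each other's updates.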
Documentation:
* Remove deleted ioctls from documentation
* Various fixes
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 4
-rw-r--r-- | arch/x86/kvm/irq.c | 2
-rw-r--r-- | arch/x86/kvm/lapic.c | 8
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 4
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 10
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 64
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 7
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 4
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 2
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 51
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 18
-rw-r--r-- | arch/x86/kvm/x86.c | 81
-rw-r--r-- | arch/x86/kvm/xen.c | 503
-rw-r--r-- | arch/x86/kvm/xen.h | 13
14 files changed, 544 insertions, 227 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 723502181a3a..0b5bf013fcb8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1233,8 +1233,12 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * Other defined bits are for MSRs that KVM does not expose: * EAX 3 SPCL, SMM page configuration lock * EAX 13 PCMSR, Prefetch control MSR + * + * KVM doesn't support SMM_CTL. + * EAX 9 SMM_CTL MSR is not supported */ entry->eax &= BIT(0) | BIT(2) | BIT(6); + entry->eax |= BIT(9); if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)) entry->eax |= BIT(2); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index d8d50558f165..a70952eca905 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -31,7 +31,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return r; } -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is a pending userspace external interrupt @@ -150,7 +149,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_inject_timer_irqs(vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1bb63746e991..4efdb4a4d72c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -160,7 +160,6 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) && !(kvm_mwait_in_guest(vcpu->kvm) || kvm_can_post_timer_interrupt(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer); static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { @@ -1914,7 +1913,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) return vcpu->arch.apic->lapic_timer.hv_timer_in_use; } -EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { @@ -2432,7 +2430,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } -EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { @@ -2724,8 +2721,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); } - } else { - kvm_lapic_xapic_id_updated(vcpu->arch.apic); } return 0; @@ -2761,6 +2756,9 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); + if (!apic_x2apic_mode(apic)) + kvm_lapic_xapic_id_updated(apic); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 69dbf17f0d6a..86d6897f4806 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -465,9 +465,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) return; for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); + page_virtual = kmap_local_page(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); + kunmap_local(page_virtual); cond_resched(); } } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 91352d692845..6ffadbd57744 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3895,8 +3895,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == 
SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + /* + * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM + * can't read guest memory (dereference memslots) to decode the WRMSR. + */ + if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && + nrips && control->next_rip) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b28be793de29..b6f4411b613e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2588,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_ept_init_mmu_context(vcpu); /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we - * have more bits than L1 expected. + * Override the CR0/CR4 read shadows after setting the effective guest + * CR0/CR4. The common helpers also set the shadows, but they don't + * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); @@ -4798,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); + /* + * If IBRS is advertised to the vCPU, KVM must flush the indirect + * branch predictors when transitioning from L2 to L1, as L1 expects + * hardware (KVM in this case) to provide separate predictor modes. + * Bare metal isolates VMX root (host) from VMX non-root (guest), but + * doesn't isolate different VMCSs, i.e. in this case, doesn't provide + * separate modes for L2 vs L1. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + indirect_branch_prediction_barrier(); + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -5131,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks - * that have higher priority than VM-Exit (see Intel SDM's pseudocode - * for VMXON), as KVM must load valid CR0/CR4 values into hardware while - * running the guest, i.e. KVM needs to check the _guest_ values. + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). * - * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and - * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real - * Mode, but KVM will never take the guest out of those modes. + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. 
*/ - if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || - !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* - * CPL=0 and all other checks that are lower priority than VM-Exit must - * be checked manually. + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); @@ -5158,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6312c9541c3c..96952263b029 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). + * Return the cr0/4 value that a nested guest would read. This is a combination + * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed + * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as + * the value+mask loaded into vmcs02 may not match the vmcs12 fields. */ static inline unsigned long nested_read_cr0(struct vmcs12 *fields) { diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 8f95c7c01433..b12da2a6dec9 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, /* Enforce CPUID restriction on max enclave size. */ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? 
sgx_12_0->edx >> 8 : sgx_12_0->edx; - if (size >= BIT_ULL(max_size_log2)) + if (size >= BIT_ULL(max_size_log2)) { kvm_inject_gp(vcpu, 0); + return 1; + } /* * sgx_virt_ecreate() returns: diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0b5db4de4d09..766c6b3ef5ed 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run) .section .text, "ax" +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) +#endif SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cea8c07f5229..fe5615fd8295 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; return flags; @@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, /* * No indirect branch prediction barrier needed when switching - * the active VMCS within a guest, e.g. on nested VM-Enter. - * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); @@ -1834,12 +1836,42 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) +/* + * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of + * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain + * backwards compatibility even though KVM doesn't support emulating SMX. And + * because userspace set "VMX in SMX", the guest must also be allowed to set it, + * e.g. if the MSR is left unlocked and the guest does a RMW operation. + */ +#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ + FEAT_CTL_SGX_LC_ENABLED | \ + FEAT_CTL_SGX_ENABLED | \ + FEAT_CTL_LMCE_ENABLED) + +static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + uint64_t valid_bits; + + /* + * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are + * exposed to the guest. 
+ */ + WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & + ~KVM_SUPPORTED_FEATURE_CONTROL); - return !(val & ~valid_bits); + if (!msr->host_initiated && + (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) + return false; + + if (msr->host_initiated) + valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; + else + valid_bits = vmx->msr_ia32_feature_control_valid_bits; + + return !(msr->data & ~valid_bits); } static int vmx_get_msr_feature(struct kvm_msr_entry *msr) @@ -2238,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vcpu, data) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEAT_CTL_LOCKED && !msr_info->host_initiated)) + if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f6f23c7397dc..842dc898c972 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -11,14 +11,28 @@ #include "../x86.h" void vmread_error(unsigned long field, bool fault); -__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, - bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +/* + * The VMREAD error trampoline _always_ uses the stack to pass parameters, even + * for 64-bit targets. Preserving all registers allows the VMREAD inline asm + * blob to avoid clobbering GPRs, which in turn allows the compiler to better + * optimize sequences of VMREADs. + * + * Declare the trampoline as an opaque label as it's not safe to call from C + * code; there is no way to tell the compiler to pass params on the stack for + * 64-bit targets. + * + * void vmread_error_trampoline(unsigned long field, bool fault); + */ +extern unsigned long vmread_error_trampoline; +#endif + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2bc3eac3f14e..fd6c01a39312 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -463,7 +463,6 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) { @@ -491,7 +490,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_recalculate_apic_map(vcpu->kvm); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. 
@@ -782,7 +780,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) @@ -811,7 +808,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) atomic_inc(&vcpu->arch.nmi_queued); kvm_make_request(KVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { @@ -836,7 +832,6 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { @@ -2069,7 +2064,6 @@ int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); int kvm_emulate_invd(struct kvm_vcpu *vcpu) { @@ -2317,13 +2311,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - if (system_time & 1) { - kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, + if (system_time & 1) + kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, sizeof(struct pvclock_vcpu_time_info)); - } else { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); - } + else + kvm_gpc_deactivate(&vcpu->arch.pv_time); return; } @@ -2515,7 +2507,6 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio) return _tsc; } -EXPORT_SYMBOL_GPL(kvm_scale_tsc); static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { @@ -2974,6 +2965,22 @@ static void kvm_update_masterclock(struct kvm *kvm) kvm_end_pvclock_update(kvm); } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. 
*/ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { @@ -2984,7 +2991,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); data->flags = 0; - if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { #ifdef CONFIG_X86_64 struct timespec64 ts; @@ -2998,7 +3006,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); @@ -3037,12 +3045,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, unsigned long flags; read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) { + while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - offset + sizeof(*guest_hv_clock))) + if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) return; read_lock_irqsave(&gpc->lock, flags); @@ -3108,7 +3114,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -3391,7 +3397,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(&vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -4431,7 +4437,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND; if (sched_info_on()) - r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + r |= KVM_XEN_HVM_CONFIG_RUNSTATE | + KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; break; #endif case KVM_CAP_SYNC_REGS: @@ -8771,7 +8778,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; - if (ctxt->have_exception) { + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. 
@@ -9035,9 +9044,11 @@ static void tsc_khz_changed(void *data) struct cpufreq_freqs *freq = data; unsigned long khz = 0; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); + if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -9058,8 +9069,10 @@ static void kvm_hyperv_tsc_notifier(void) hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -9196,10 +9209,10 @@ static void kvm_timer_init(void) } cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", - kvmclock_cpu_online, kvmclock_cpu_down_prep); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", + kvmclock_cpu_online, kvmclock_cpu_down_prep); + } } #ifdef CONFIG_X86_64 @@ -9359,10 +9372,11 @@ void kvm_arch_exit(void) #endif kvm_lapic_exit(); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + } #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); irq_work_sync(&pvclock_irq_work); @@ -10276,8 +10290,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; r = 0; + goto out; } - goto out; } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. 
Do synthetic halt */ @@ -11538,7 +11552,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -12040,7 +12054,6 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4b8e9628fbf5..d7af40240248 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gpc_deactivate(kvm, gpc); + kvm_gpc_deactivate(gpc); goto out; } do { - ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, - PAGE_SIZE); + ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); if (ret) goto out; @@ -170,112 +169,45 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) vcpu->arch.xen.timer.function = xen_timer_callback; } -static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; - u64 now = get_kvmclock_ns(v->kvm); - u64 delta_ns = now - vx->runstate_entry_time; - u64 run_delay = current->sched_info.run_delay; - - if (unlikely(!vx->runstate_entry_time)) - vx->current_runstate = RUNSTATE_offline; - - /* - * Time waiting for the scheduler isn't "stolen" if the - * vCPU wasn't running anyway. - */ - if (vx->current_runstate == RUNSTATE_running) { - u64 steal_ns = run_delay - vx->last_steal; - - delta_ns -= steal_ns; - - vx->runstate_times[RUNSTATE_runnable] += steal_ns; - } - vx->last_steal = run_delay; - - vx->runstate_times[vx->current_runstate] += delta_ns; - vx->current_runstate = state; - vx->runstate_entry_time = now; -} - -void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) -{ - struct kvm_vcpu_xen *vx = &v->arch.xen; - struct gfn_to_pfn_cache *gpc = &vx->runstate_cache; - uint64_t *user_times; + struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache; + struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; + size_t user_len, user_len1, user_len2; + struct vcpu_runstate_info rs; unsigned long flags; - size_t user_len; - int *user_state; - - kvm_xen_update_runstate(v, state); - - if (!vx->runstate_cache.active) - return; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_len = sizeof(struct vcpu_runstate_info); - else - user_len = sizeof(struct compat_vcpu_runstate_info); - - read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - user_len)) { - read_unlock_irqrestore(&gpc->lock, flags); - - /* When invoked from kvm_sched_out() we cannot sleep */ - if (state == RUNSTATE_runnable) - return; - - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len)) - return; - - read_lock_irqsave(&gpc->lock, flags); - } + size_t times_ofs; + uint8_t *update_bit = NULL; + uint64_t entry_time; + uint64_t *rs_times; + int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the - * runstate struct us the alignment of uint64_t in 32-bit, which + * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of - * padding after 
the first field 'state'. - * - * So we use 'int __user *user_state' to point to the state field, - * and 'uint64_t __user *user_times' for runstate_entry_time. So - * the actual array of time[] in each state starts at user_times[1]. + * padding after the first field 'state'. Let's be really really + * paranoid about that, and matching it with our internal data + * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 + /* + * The 64-bit structure has 4 bytes of padding before 'state_entry_time' + * so each subsequent field is shifted by 4, and it's 4 bytes longer. + */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); + BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif - - user_state = gpc->khva; - - if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) - user_times = gpc->khva + offsetof(struct vcpu_runstate_info, - state_entry_time); - else - user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info, - state_entry_time); - /* - * First write the updated state_entry_time at the appropriate - * location determined by 'offset'. - */ - BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != - sizeof(user_times[0])); - - user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE; - smp_wmb(); - - /* - * Next, write the new runstate. This is in the *same* place - * for 32-bit and 64-bit guests, asserted here for paranoia. + * The state field is in the same place at the start of both structs, + * and is the same size (int) as vx->current_runstate. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); @@ -284,34 +216,238 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - *user_state = vx->current_runstate; + /* + * The state_entry_time field is 64 bits in both versions, and the + * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 + * is little-endian means that it's in the last *byte* of the word. + * That detail is important later. + */ + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != + sizeof(uint64_t)); + BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); /* - * Write the actual runstate times immediately after the - * runstate_entry_time. + * The time array is four 64-bit quantities in both versions, matching + * the vx->runstate_times and immediately following state_entry_time. 
*/ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != - offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != - offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); + offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof_field(struct compat_vcpu_runstate_info, time)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); - smp_wmb(); + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + user_len = sizeof(struct vcpu_runstate_info); + times_ofs = offsetof(struct vcpu_runstate_info, + state_entry_time); + } else { + user_len = sizeof(struct compat_vcpu_runstate_info); + times_ofs = offsetof(struct compat_vcpu_runstate_info, + state_entry_time); + } + + /* + * There are basically no alignment constraints. The guest can set it + * up so it crosses from one page to the next, and at arbitrary byte + * alignment (and the 32-bit ABI doesn't align the 64-bit integers + * anyway, even if the overall struct had been 64-bit aligned). + */ + if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) { + user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK); + user_len2 = user_len - user_len1; + } else { + user_len1 = user_len; + user_len2 = 0; + } + BUG_ON(user_len1 + user_len2 != user_len); + + retry: + /* + * Attempt to obtain the GPC lock on *both* (if there are two) + * gfn_to_pfn caches that cover the region. + */ + read_lock_irqsave(&gpc1->lock, flags); + while (!kvm_gpc_check(gpc1, user_len1)) { + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + if (kvm_gpc_refresh(gpc1, user_len1)) + return; + + read_lock_irqsave(&gpc1->lock, flags); + } + + if (likely(!user_len2)) { + /* + * Set up three pointers directly to the runstate_info + * struct in the guest (via the GPC). + * + * • @rs_state → state field + * • @rs_times → state_entry_time field. + * • @update_bit → last byte of state_entry_time, which + * contains the XEN_RUNSTATE_UPDATE bit. + */ + rs_state = gpc1->khva; + rs_times = gpc1->khva + times_ofs; + if (v->kvm->arch.xen.runstate_update_flag) + update_bit = ((void *)(&rs_times[1])) - 1; + } else { + /* + * The guest's runstate_info is split across two pages and we + * need to hold and validate both GPCs simultaneously. We can + * declare a lock ordering GPC1 > GPC2 because nothing else + * takes them more than one at a time. + */ + read_lock(&gpc2->lock); + + if (!kvm_gpc_check(gpc2, user_len2)) { + read_unlock(&gpc2->lock); + read_unlock_irqrestore(&gpc1->lock, flags); + + /* When invoked from kvm_sched_out() we cannot sleep */ + if (atomic) + return; + + /* + * Use kvm_gpc_activate() here because if the runstate + * area was configured in 32-bit mode and only extends + * to the second page now because the guest changed to + * 64-bit mode, the second GPC won't have been set up. + */ + if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1, + user_len2)) + return; + + /* + * We dropped the lock on GPC1 so we have to go all the + * way back and revalidate that too. 
+ */ + goto retry; + } + + /* + * In this case, the runstate_info struct will be assembled on + * the kernel stack (compat or not as appropriate) and will + * be copied to GPC1/GPC2 with a dual memcpy. Set up the three + * rs pointers accordingly. + */ + rs_times = &rs.state_entry_time; + + /* + * The rs_state pointer points to the start of what we'll + * copy to the guest, which in the case of a compat guest + * is the 32-bit field that the compiler thinks is padding. + */ + rs_state = ((void *)rs_times) - times_ofs; + + /* + * The update_bit is still directly in the guest memory, + * via one GPC or the other. + */ + if (v->kvm->arch.xen.runstate_update_flag) { + if (user_len1 >= times_ofs + sizeof(uint64_t)) + update_bit = gpc1->khva + times_ofs + + sizeof(uint64_t) - 1; + else + update_bit = gpc2->khva + times_ofs + + sizeof(uint64_t) - 1 - user_len1; + } + +#ifdef CONFIG_X86_64 + /* + * Don't leak kernel memory through the padding in the 64-bit + * version of the struct. + */ + memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); +#endif + } + + /* + * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the + * state_entry_time field, directly in the guest. We need to set + * that (and write-barrier) before writing to the rest of the + * structure, and clear it last. Just as Xen does, we address the + * single *byte* in which it resides because it might be in a + * different cache line to the rest of the 64-bit word, due to + * the (lack of) alignment constraints. + */ + entry_time = vx->runstate_entry_time; + if (update_bit) { + entry_time |= XEN_RUNSTATE_UPDATE; + *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; + smp_wmb(); + } /* - * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's - * runstate_entry_time field. + * Now assemble the actual structure, either on our kernel stack + * or directly in the guest according to how the rs_state and + * rs_times pointers were set up above. */ - user_times[0] &= ~XEN_RUNSTATE_UPDATE; + *rs_state = vx->current_runstate; + rs_times[0] = entry_time; + memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); + + /* For the split case, we have to then copy it to the guest. */ + if (user_len2) { + memcpy(gpc1->khva, rs_state, user_len1); + memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); + } smp_wmb(); - read_unlock_irqrestore(&gpc->lock, flags); + /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ + if (update_bit) { + entry_time &= ~XEN_RUNSTATE_UPDATE; + *update_bit = entry_time >> 56; + smp_wmb(); + } - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + if (user_len2) + read_unlock(&gpc2->lock); + + read_unlock_irqrestore(&gpc1->lock, flags); + + mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); + if (user_len2) + mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); +} + +void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) +{ + struct kvm_vcpu_xen *vx = &v->arch.xen; + u64 now = get_kvmclock_ns(v->kvm); + u64 delta_ns = now - vx->runstate_entry_time; + u64 run_delay = current->sched_info.run_delay; + + if (unlikely(!vx->runstate_entry_time)) + vx->current_runstate = RUNSTATE_offline; + + /* + * Time waiting for the scheduler isn't "stolen" if the + * vCPU wasn't running anyway. 
+ */ + if (vx->current_runstate == RUNSTATE_running) { + u64 steal_ns = run_delay - vx->last_steal; + + delta_ns -= steal_ns; + + vx->runstate_times[RUNSTATE_runnable] += steal_ns; + } + vx->last_steal = run_delay; + + vx->runstate_times[vx->current_runstate] += delta_ns; + vx->current_runstate = state; + vx->runstate_entry_time = now; + + if (vx->runstate_cache.active) + kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) @@ -352,12 +488,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); @@ -417,8 +551,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); - while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* @@ -432,8 +565,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (in_atomic() || !task_is_running(current)) return 1; - if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, - sizeof(struct vcpu_info))) { + if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. @@ -493,6 +625,17 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + mutex_lock(&kvm->lock); + kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; + mutex_unlock(&kvm->lock); + r = 0; + break; + default: break; } @@ -530,6 +673,15 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = 0; break; + case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: + if (!sched_info_on()) { + r = -EOPNOTSUPP; + break; + } + data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; + r = 0; + break; + default: break; } @@ -554,15 +706,13 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, NULL, - KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,37 +720,65 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gpc_activate(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, + r = 
kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache, + data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; - case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: + case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: { + size_t sz, sz1, sz2; + if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.gpa == GPA_INVALID) { - kvm_gpc_deactivate(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); r = 0; + deactivate_out: + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); break; } - r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); - break; + /* + * If the guest switches to 64-bit mode after setting the runstate + * address, that's actually OK. kvm_xen_update_runstate_guest() + * will cope. + */ + if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) + sz = sizeof(struct vcpu_runstate_info); + else + sz = sizeof(struct compat_vcpu_runstate_info); + + /* How much fits in the (first) page? */ + sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache, + data->u.gpa, sz1); + if (r) + goto deactivate_out; + + /* Either map the second page, or deactivate the second GPC */ + if (sz1 >= sz) { + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + } else { + sz2 = sz - sz1; + BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); + r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache, + data->u.gpa + sz1, sz2); + if (r) + goto deactivate_out; + } + kvm_xen_update_runstate_guest(vcpu, false); + break; + } case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; @@ -693,6 +871,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (data->u.runstate.state <= RUNSTATE_offline) kvm_xen_update_runstate(vcpu, data->u.runstate.state); + else if (vcpu->arch.xen.runstate_cache.active) + kvm_xen_update_runstate_guest(vcpu, false); r = 0; break; @@ -972,9 +1152,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, bool ret = true; int idx, i; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + read_lock_irqsave(&gpc->lock, flags); + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; ret = false; @@ -994,8 +1174,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -1008,20 +1188,45 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, evtchn_port_t port, *ports; gpa_t gpa; - if (!longmode || !lapic_in_kernel(vcpu) || + if (!lapic_in_kernel(vcpu) || !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, - sizeof(sched_poll))) { + if (!gpa) { *r = -EFAULT; return true; } + if (IS_ENABLED(CONFIG_64BIT) && !longmode) { + struct compat_sched_poll sp32; + + /* Sanity check that the compat struct definition is correct */ + BUILD_BUG_ON(sizeof(sp32) != 16); + + if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) { + *r = -EFAULT; + return true; + } + + /* + * This is a 32-bit pointer to an array of evtchn_port_t 
which + * are uint32_t, so once it's converted no further compat + * handling is needed. + */ + sched_poll.ports = (void *)(unsigned long)(sp32.ports); + sched_poll.nr_ports = sp32.nr_ports; + sched_poll.timeout = sp32.timeout; + } else { + if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, + sizeof(sched_poll))) { + *r = -EFAULT; + return true; + } + } + if (unlikely(sched_poll.nr_ports > 1)) { /* Xen (unofficially) limits number of pollers to 128 */ if (sched_poll.nr_ports > 128) { @@ -1371,7 +1576,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { @@ -1405,7 +1610,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); - if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) { + if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. @@ -1503,7 +1708,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) break; idx = srcu_read_lock(&kvm->srcu); - rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE); + rc = kvm_gpc_refresh(gpc, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); @@ -1833,9 +2038,14 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - kvm_gpc_init(&vcpu->arch.xen.runstate_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL, + KVM_HOST_USES_PFN); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1843,9 +2053,10 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); - kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); del_timer_sync(&vcpu->arch.xen.poll_timer); } @@ -1853,7 +2064,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); - kvm_gpc_init(&kvm->arch.xen.shinfo_cache); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1861,7 +2072,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 532a535a9e99..ea33d80a0c51 100644 --- 
a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -143,11 +143,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu); #include <asm/xen/interface.h> #include <xen/interface/vcpu.h> -void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state); +void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state); static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu) { - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running); + kvm_xen_update_runstate(vcpu, RUNSTATE_running); } static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) @@ -162,7 +162,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!vcpu->preempted)) return; - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); + kvm_xen_update_runstate(vcpu, RUNSTATE_runnable); } /* 32-bit compatibility definitions, also used natively in 32-bit build */ @@ -207,4 +207,11 @@ struct compat_vcpu_runstate_info { uint64_t time[4]; } __attribute__((packed)); +struct compat_sched_poll { + /* This is actually a guest virtual address which points to ports. */ + uint32_t ports; + unsigned int nr_ports; + uint64_t timeout; +}; + #endif /* __ARCH_X86_KVM_XEN_H__ */ |
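The xen.c change above toggles the XEN_RUNSTATE_UPDATE flag by storing only
the final byte of the little-endian state_entry_time word. A standalone
sketch of that byte addressing follows; the flag value is implied by the
BUILD_BUG_ON in the hunk, while the initial value and output format are
purely illustrative.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define XEN_RUNSTATE_UPDATE (1ULL << 63)

int main(void)
{
	uint64_t state_entry_time = 0x0123456789abcdefULL;
	/*
	 * On little-endian x86, bit 63 lives in the last byte of the word,
	 * so the update flag can be set and cleared by storing that single
	 * byte, without rewriting the rest of the field.
	 */
	uint8_t *update_bit = (uint8_t *)&state_entry_time + sizeof(uint64_t) - 1;

	*update_bit = (state_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
	printf("flag set:     %#018" PRIx64 "\n", state_entry_time);

	*update_bit = (state_entry_time & ~XEN_RUNSTATE_UPDATE) >> 56;
	printf("flag cleared: %#018" PRIx64 "\n", state_entry_time);
	return 0;
}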