author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:00:24 -0800
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-16 13:00:24 -0800
commit | 974aa5630b318938273d7efe7a2cf031c7b927db |
tree | b79803c07b9c16d87058ce69f80ebe173cdfd838 /arch |
parent | 441692aafc1731087bbaf657a8b6059d95c2a6df |
parent | a6014f1ab7088dc02b58991cfb6b32a34afdbf12 |
download | linux-974aa5630b318938273d7efe7a2cf031c7b927db.tar.bz2 |
Merge tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář:
"First batch of KVM changes for 4.15
Common:
- Python 3 support in kvm_stat
- Accounting of slabs to kmemcg
ARM:
- Optimized arch timer handling for KVM/ARM
- Improvements to the VGIC ITS code and introduction of an ITS reset
ioctl (a usage sketch follows the quoted log)
- Unification of the 32-bit fault injection logic
- More exact external abort matching logic
PPC:
- Support for running hashed page table (HPT) MMU mode on a host that
is using the radix MMU mode; single-threaded mode on POWER9 is
added as a prerequisite
- Resolution of merge conflicts with the last-second 4.14 HPT fixes
- Fixes and cleanups
s390:
- Some initial preparation patches for exitless interrupts and crypto
- New capability for AIS migration
- Fixes
x86:
- Improved emulation of LAPIC timer mode changes, MCi_STATUS MSRs,
and after-reset state
- Refined dependencies for VMX features
- Fixes for nested SMI injection
- A lot of cleanups"
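The ITS reset ioctl called out in the ARM section above is exposed through KVM's device-attribute interface: the arm and arm64 uapi diffs below add KVM_DEV_ARM_ITS_CTRL_RESET. A minimal userspace sketch of driving it could look like the following; the its_fd handle and the use of the KVM_DEV_ARM_VGIC_GRP_CTRL group are assumptions for illustration, only the attribute value itself comes from this merge.

```c
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

/*
 * Ask KVM to reset the emulated ITS.  "its_fd" is assumed to be a device
 * fd previously created with KVM_CREATE_DEVICE for KVM_DEV_TYPE_ARM_VGIC_ITS;
 * the control group used here is an assumption, only the attribute value
 * (KVM_DEV_ARM_ITS_CTRL_RESET) is taken from this merge.
 */
static int its_reset(int its_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_ITS_CTRL_RESET,
	};

	if (ioctl(its_fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
		perror("KVM_SET_DEVICE_ATTR(KVM_DEV_ARM_ITS_CTRL_RESET)");
		return -1;
	}
	return 0;
}
```

A single reset attribute presumably lets userspace bring the ITS back to its initial state on VM reset without tearing down and re-creating the device.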
* tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (89 commits)
KVM: s390: provide a capability for AIS state migration
KVM: s390: clear_io_irq() requests are not expected for adapter interrupts
KVM: s390: abstract conversion between isc and enum irq_types
KVM: s390: vsie: use common code functions for pinning
KVM: s390: SIE considerations for AP Queue virtualization
KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup
KVM: PPC: Book3S HV: Cosmetic post-merge cleanups
KVM: arm/arm64: fix the incompatible matching for external abort
KVM: arm/arm64: Unify 32bit fault injection
KVM: arm/arm64: vgic-its: Implement KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: Document KVM_DEV_ARM_ITS_CTRL_RESET
KVM: arm/arm64: vgic-its: Free caches when GITS_BASER Valid bit is cleared
KVM: arm/arm64: vgic-its: New helper functions to free the caches
KVM: arm/arm64: vgic-its: Remove kvm_its_unmap_device
arm/arm64: KVM: Load the timer state when enabling the timer
KVM: arm/arm64: Rework kvm_timer_should_fire
KVM: arm/arm64: Get rid of kvm_timer_flush_hwstate
KVM: arm/arm64: Avoid phys timer emulation in vcpu entry/exit
KVM: arm/arm64: Move phys_timer_emulate function
KVM: arm/arm64: Use kvm_arm_timer_set/get_reg for guest register traps
...
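The first shortlog entry above ("KVM: s390: provide a capability for AIS state migration") pairs the adapter-interruption-suppression (AIS) migration support with a capability that userspace can probe. A hedged sketch of such a probe follows; the capability name KVM_CAP_S390_AIS_MIGRATION does not appear in this view and is an assumption, as is the vm_fd handle.

```c
#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Probe whether this kernel can expose s390 AIS state for migration.
 * "vm_fd" is assumed to come from KVM_CREATE_VM; the capability name is
 * an assumption based on the shortlog entry above, guarded in case the
 * headers predate it.
 */
static int ais_migration_supported(int vm_fd)
{
#ifdef KVM_CAP_S390_AIS_MIGRATION
	return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_AIS_MIGRATION) > 0;
#else
	return 0;
#endif
}
```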
Diffstat (limited to 'arch')
47 files changed, 1474 insertions, 877 deletions
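Among the arch changes below, the arm and arm64 uapi headers gain ONE_REG identifiers for the physical timer (KVM_REG_ARM_PTIMER_CTL, KVM_REG_ARM_PTIMER_CVAL, KVM_REG_ARM_PTIMER_CNT) alongside the existing virtual-timer IDs, and the sysreg trap handlers are rewired to go through kvm_arm_timer_set/get_reg. A minimal arm64 userspace sketch of reading one of the new registers is shown here; the vcpu_fd handle is an assumption.

```c
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Read the guest's physical timer compare value through the ONE_REG
 * interface, using the register ID added by this series.  "vcpu_fd" is
 * assumed to be a vcpu fd obtained with KVM_CREATE_VCPU.
 */
static int read_ptimer_cval(int vcpu_fd, uint64_t *cval)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM_PTIMER_CVAL,
		.addr = (uint64_t)(unsigned long)cval,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) {
		perror("KVM_GET_ONE_REG(KVM_REG_ARM_PTIMER_CVAL)");
		return -1;
	}
	return 0;
}
```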
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 14d68a4d826f..36dd2962a42d 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -68,6 +68,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); +extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); + extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 98089ffd91bb..3d22eb87f919 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -25,7 +25,22 @@ #include <asm/kvm_arm.h> #include <asm/cputype.h> +/* arm64 compatibility macros */ +#define COMPAT_PSR_MODE_ABT ABT_MODE +#define COMPAT_PSR_MODE_UND UND_MODE +#define COMPAT_PSR_T_BIT PSR_T_BIT +#define COMPAT_PSR_I_BIT PSR_I_BIT +#define COMPAT_PSR_A_BIT PSR_A_BIT +#define COMPAT_PSR_E_BIT PSR_E_BIT +#define COMPAT_PSR_IT_MASK PSR_IT_MASK + unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); + +static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num) +{ + return vcpu_reg(vcpu, reg_num); +} + unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, @@ -42,10 +57,25 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); -void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_undef32(struct kvm_vcpu *vcpu); +void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); +void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_vabt(struct kvm_vcpu *vcpu); -void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); + +static inline void kvm_inject_undefined(struct kvm_vcpu *vcpu) +{ + kvm_inject_undef32(vcpu); +} + +static inline void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) +{ + kvm_inject_dabt32(vcpu, addr); +} + +static inline void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) +{ + kvm_inject_pabt32(vcpu, addr); +} static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) { @@ -203,7 +233,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) { - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + switch (kvm_vcpu_trap_get_fault(vcpu)) { case FSC_SEA: case FSC_SEA_TTW0: case FSC_SEA_TTW1: diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 14b5903f0224..ab20ffa8b9e7 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -98,8 +98,8 @@ #define cntvoff_el2 CNTVOFF #define cnthctl_el2 CNTHCTL -void __timer_save_state(struct kvm_vcpu *vcpu); -void __timer_restore_state(struct kvm_vcpu *vcpu); +void __timer_enable_traps(struct kvm_vcpu *vcpu); +void __timer_disable_traps(struct kvm_vcpu *vcpu); void __vgic_v2_save_state(struct kvm_vcpu *vcpu); void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 1f57bbe82b6f..6edd177bb1c7 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -152,6 +152,12 @@ 
struct kvm_arch_memory_slot { (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64) #define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__) +/* PL1 Physical Timer Registers */ +#define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1) +#define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14) +#define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14) + +/* Virtual Timer Registers */ #define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1) #define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14) #define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14) @@ -216,6 +222,7 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 +#define KVM_DEV_ARM_ITS_CTRL_RESET 4 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index 30a13647c54c..cdff963f133a 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -165,143 +165,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu) * Inject exceptions into the guest */ -static u32 exc_vector_base(struct kvm_vcpu *vcpu) -{ - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - u32 vbar = vcpu_cp15(vcpu, c12_VBAR); - - if (sctlr & SCTLR_V) - return 0xffff0000; - else /* always have security exceptions */ - return vbar; -} - -/* - * Switch to an exception mode, updating both CPSR and SPSR. Follow - * the logic described in AArch32.EnterMode() from the ARMv8 ARM. - */ -static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode) -{ - unsigned long cpsr = *vcpu_cpsr(vcpu); - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - - *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode; - - switch (mode) { - case FIQ_MODE: - *vcpu_cpsr(vcpu) |= PSR_F_BIT; - /* Fall through */ - case ABT_MODE: - case IRQ_MODE: - *vcpu_cpsr(vcpu) |= PSR_A_BIT; - /* Fall through */ - default: - *vcpu_cpsr(vcpu) |= PSR_I_BIT; - } - - *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT); - - if (sctlr & SCTLR_TE) - *vcpu_cpsr(vcpu) |= PSR_T_BIT; - if (sctlr & SCTLR_EE) - *vcpu_cpsr(vcpu) |= PSR_E_BIT; - - /* Note: These now point to the mode banked copies */ - *vcpu_spsr(vcpu) = cpsr; -} - -/** - * kvm_inject_undefined - inject an undefined exception into the guest - * @vcpu: The VCPU to receive the undefined exception - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - * - * Modelled after TakeUndefInstrException() pseudocode. - */ -void kvm_inject_undefined(struct kvm_vcpu *vcpu) -{ - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_thumb = (cpsr & PSR_T_BIT); - u32 vect_offset = 4; - u32 return_offset = (is_thumb) ? 2 : 4; - - kvm_update_psr(vcpu, UND_MODE); - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - /* Branch to exception vector */ - *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; -} - -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. - */ -static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) -{ - u32 vect_offset; - u32 return_offset = (is_pabt) ? 
4 : 8; - bool is_lpae; - - kvm_update_psr(vcpu, ABT_MODE); - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - if (is_pabt) - vect_offset = 12; - else - vect_offset = 16; - - /* Branch to exception vector */ - *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; - - if (is_pabt) { - /* Set IFAR and IFSR */ - vcpu_cp15(vcpu, c6_IFAR) = addr; - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - /* Always give debug fault for now - should give guest a clue */ - if (is_lpae) - vcpu_cp15(vcpu, c5_IFSR) = 1 << 9 | 0x22; - else - vcpu_cp15(vcpu, c5_IFSR) = 2; - } else { /* !iabt */ - /* Set DFAR and DFSR */ - vcpu_cp15(vcpu, c6_DFAR) = addr; - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - /* Always give debug fault for now - should give guest a clue */ - if (is_lpae) - vcpu_cp15(vcpu, c5_DFSR) = 1 << 9 | 0x22; - else - vcpu_cp15(vcpu, c5_DFSR) = 2; - } - -} - -/** - * kvm_inject_dabt - inject a data abort into the guest - * @vcpu: The VCPU to receive the undefined exception - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - */ -void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt(vcpu, false, addr); -} - -/** - * kvm_inject_pabt - inject a prefetch abort into the guest - * @vcpu: The VCPU to receive the undefined exception - * @addr: The address to report in the DFAR - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. - */ -void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) -{ - inject_abt(vcpu, true, addr); -} - /** * kvm_inject_vabt - inject an async abort / SError into the guest * @vcpu: The VCPU to receive the exception diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index ebd2dd46adf7..330c9ce34ba5 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -174,7 +174,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) __activate_vm(vcpu); __vgic_restore_state(vcpu); - __timer_restore_state(vcpu); + __timer_enable_traps(vcpu); __sysreg_restore_state(guest_ctxt); __banked_restore_state(guest_ctxt); @@ -191,7 +191,8 @@ again: __banked_save_state(guest_ctxt); __sysreg_save_state(guest_ctxt); - __timer_save_state(vcpu); + __timer_disable_traps(vcpu); + __vgic_save_state(vcpu); __deactivate_traps(vcpu); @@ -237,7 +238,7 @@ void __hyp_text __noreturn __hyp_panic(int cause) vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); - __timer_save_state(vcpu); + __timer_disable_traps(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); __banked_restore_state(host_ctxt); diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index bdedd8f748d1..f2a234d6516c 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -52,6 +52,7 @@ struct arch_timer_erratum_workaround { const char *desc; u32 (*read_cntp_tval_el0)(void); u32 (*read_cntv_tval_el0)(void); + u64 (*read_cntpct_el0)(void); u64 (*read_cntvct_el0)(void); int (*set_next_event_phys)(unsigned long, struct clock_event_device *); int (*set_next_event_virt)(unsigned long, struct clock_event_device *); @@ -149,11 +150,8 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl) static inline u64 arch_counter_get_cntpct(void) { - /* - * AArch64 kernel and user space mandate the use of CNTVCT. 
- */ - BUG(); - return 0; + isb(); + return arch_timer_reg_read_stable(cntpct_el0); } static inline u64 arch_counter_get_cntvct(void) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 26a64d0f9ab9..ab4d0a926043 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -55,6 +55,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); +extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); + extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_ich_vtr_el2(void); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index e5df3fce0008..5f28dfa14cee 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -41,6 +41,9 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu); void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); +void kvm_inject_undef32(struct kvm_vcpu *vcpu); +void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); +void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { @@ -237,7 +240,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu) { - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + switch (kvm_vcpu_trap_get_fault(vcpu)) { case FSC_SEA: case FSC_SEA_TTW0: case FSC_SEA_TTW1: diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4572a9b560fa..08d3bb66c8b7 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -129,8 +129,8 @@ void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); -void __timer_save_state(struct kvm_vcpu *vcpu); -void __timer_restore_state(struct kvm_vcpu *vcpu); +void __timer_enable_traps(struct kvm_vcpu *vcpu); +void __timer_disable_traps(struct kvm_vcpu *vcpu); void __sysreg_save_host_state(struct kvm_cpu_context *ctxt); void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt); diff --git a/arch/arm64/include/asm/timex.h b/arch/arm64/include/asm/timex.h index 81a076eb37fa..9ad60bae5c8d 100644 --- a/arch/arm64/include/asm/timex.h +++ b/arch/arm64/include/asm/timex.h @@ -22,7 +22,7 @@ * Use the current timer as a cycle counter since this is what we use for * the delay loop. */ -#define get_cycles() arch_counter_get_cntvct() +#define get_cycles() arch_timer_read_counter() #include <asm-generic/timex.h> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 51149ec75fe4..9abbf3044654 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -196,6 +196,12 @@ struct kvm_arch_memory_slot { #define ARM64_SYS_REG(...) 
(__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64) +/* Physical Timer EL0 Registers */ +#define KVM_REG_ARM_PTIMER_CTL ARM64_SYS_REG(3, 3, 14, 2, 1) +#define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2) +#define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1) + +/* EL0 Virtual Timer Registers */ #define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) #define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) @@ -228,6 +234,7 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_ITS_SAVE_TABLES 1 #define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 +#define KVM_DEV_ARM_ITS_CTRL_RESET 4 /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 951f3ebaff26..525c01f48867 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -304,7 +304,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) __activate_vm(vcpu); __vgic_restore_state(vcpu); - __timer_restore_state(vcpu); + __timer_enable_traps(vcpu); /* * We must restore the 32-bit state before the sysregs, thanks @@ -374,7 +374,7 @@ again: __sysreg_save_guest_state(guest_ctxt); __sysreg32_save_state(vcpu); - __timer_save_state(vcpu); + __timer_disable_traps(vcpu); __vgic_save_state(vcpu); __deactivate_traps(vcpu); @@ -442,7 +442,7 @@ void __hyp_text __noreturn __hyp_panic(void) vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); - __timer_save_state(vcpu); + __timer_disable_traps(vcpu); __deactivate_traps(vcpu); __deactivate_vm(vcpu); __sysreg_restore_host_state(host_ctxt); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 3556715a774e..8ecbcb40e317 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -33,88 +33,6 @@ #define LOWER_EL_AArch64_VECTOR 0x400 #define LOWER_EL_AArch32_VECTOR 0x600 -/* - * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. - */ -static const u8 return_offsets[8][2] = { - [0] = { 0, 0 }, /* Reset, unused */ - [1] = { 4, 2 }, /* Undefined */ - [2] = { 0, 0 }, /* SVC, unused */ - [3] = { 4, 4 }, /* Prefetch abort */ - [4] = { 8, 8 }, /* Data abort */ - [5] = { 0, 0 }, /* HVC, unused */ - [6] = { 4, 4 }, /* IRQ, unused */ - [7] = { 4, 4 }, /* FIQ, unused */ -}; - -static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) -{ - unsigned long cpsr; - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); - bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); - u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; - u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - - cpsr = mode | COMPAT_PSR_I_BIT; - - if (sctlr & (1 << 30)) - cpsr |= COMPAT_PSR_T_BIT; - if (sctlr & (1 << 25)) - cpsr |= COMPAT_PSR_E_BIT; - - *vcpu_cpsr(vcpu) = cpsr; - - /* Note: These now point to the banked copies */ - *vcpu_spsr(vcpu) = new_spsr_value; - *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; - - /* Branch to exception vector */ - if (sctlr & (1 << 13)) - vect_offset += 0xffff0000; - else /* always have security exceptions */ - vect_offset += vcpu_cp15(vcpu, c12_VBAR); - - *vcpu_pc(vcpu) = vect_offset; -} - -static void inject_undef32(struct kvm_vcpu *vcpu) -{ - prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4); -} - -/* - * Modelled after TakeDataAbortException() and TakePrefetchAbortException - * pseudocode. 
- */ -static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, - unsigned long addr) -{ - u32 vect_offset; - u32 *far, *fsr; - bool is_lpae; - - if (is_pabt) { - vect_offset = 12; - far = &vcpu_cp15(vcpu, c6_IFAR); - fsr = &vcpu_cp15(vcpu, c5_IFSR); - } else { /* !iabt */ - vect_offset = 16; - far = &vcpu_cp15(vcpu, c6_DFAR); - fsr = &vcpu_cp15(vcpu, c5_DFSR); - } - - prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset); - - *far = addr; - - /* Give the guest an IMPLEMENTATION DEFINED exception */ - is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); - if (is_lpae) - *fsr = 1 << 9 | 0x34; - else - *fsr = 0x14; -} - enum exception_type { except_type_sync = 0, except_type_irq = 0x80, @@ -211,7 +129,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) - inject_abt32(vcpu, false, addr); + kvm_inject_dabt32(vcpu, addr); else inject_abt64(vcpu, false, addr); } @@ -227,7 +145,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) - inject_abt32(vcpu, true, addr); + kvm_inject_pabt32(vcpu, addr); else inject_abt64(vcpu, true, addr); } @@ -241,7 +159,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) void kvm_inject_undefined(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.hcr_el2 & HCR_RW)) - inject_undef32(vcpu); + kvm_inject_undef32(vcpu); else inject_undef64(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a0ee9b05e3d4..1830ebc227d1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -842,13 +842,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); u64 now = kvm_phys_timer_read(); + u64 cval; - if (p->is_write) - ptimer->cnt_cval = p->regval + now; - else - p->regval = ptimer->cnt_cval - now; + if (p->is_write) { + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, + p->regval + now); + } else { + cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); + p->regval = cval - now; + } return true; } @@ -857,24 +860,10 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - - if (p->is_write) { - /* ISTATUS bit is read-only */ - ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT; - } else { - u64 now = kvm_phys_timer_read(); - - p->regval = ptimer->cnt_ctl; - /* - * Set ISTATUS bit if it's expired. - * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is - * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit - * regardless of ENABLE bit for our implementation convenience. 
- */ - if (ptimer->cnt_cval <= now) - p->regval |= ARCH_TIMER_CTRL_IT_STAT; - } + if (p->is_write) + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval); + else + p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); return true; } @@ -883,12 +872,10 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - if (p->is_write) - ptimer->cnt_cval = p->regval; + kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); else - p->regval = ptimer->cnt_cval; + p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); return true; } diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index b8d5b8e35244..9a667007bff8 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); -extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); +extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned long psize); extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d55c7f881ce7..735cfa35298a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ +#include <linux/string.h> +#include <asm/bitops.h> #include <asm/book3s/64/mmu-hash.h> /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ @@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) hpte[0] = cpu_to_be64(hpte_v); } +/* + * These functions encode knowledge of the POWER7/8/9 hardware + * interpretations of the HPTE LP (large page size) field. + */ +static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) +{ + unsigned int lphi; + + if (!(h & HPTE_V_LARGE)) + return 12; /* 4kB */ + lphi = (l >> 16) & 0xf; + switch ((l >> 12) & 0xf) { + case 0: + return !lphi ? 24 : -1; /* 16MB */ + break; + case 1: + return 16; /* 64kB */ + break; + case 3: + return !lphi ? 
34 : -1; /* 16GB */ + break; + case 7: + return (16 << 8) + 12; /* 64kB in 4kB */ + break; + case 8: + if (!lphi) + return (24 << 8) + 16; /* 16MB in 64kkB */ + if (lphi == 3) + return (24 << 8) + 12; /* 16MB in 4kB */ + break; + } + return -1; +} + +static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) +{ + return kvmppc_hpte_page_shifts(h, l) & 0xff; +} + +static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l) +{ + int tmp = kvmppc_hpte_page_shifts(h, l); + + if (tmp >= 0x100) + tmp >>= 8; + return tmp; +} + +static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) +{ + return 1ul << kvmppc_hpte_actual_page_shift(v, r); +} + +static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) +{ + switch (base_shift) { + case 12: + switch (actual_shift) { + case 12: + return 0; + case 16: + return 7; + case 24: + return 0x38; + } + break; + case 16: + switch (actual_shift) { + case 16: + return 1; + case 24: + return 8; + } + break; + case 24: + return 0; + } + return -1; +} + static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { - int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; - unsigned int penc; + int a_pgshift, b_pgshift; unsigned long rb = 0, va_low, sllp; - unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); - if (v & HPTE_V_LARGE) { - i = hpte_page_sizes[lp]; - b_psize = i & 0xf; - a_psize = i >> 4; + b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r); + if (a_pgshift >= 0x100) { + b_pgshift &= 0xff; + a_pgshift >>= 8; } /* @@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low ^= v >> (SID_SHIFT_1T - 16); va_low &= 0x7ff; - switch (b_psize) { - case MMU_PAGE_4K: - sllp = get_sllp_encoding(a_psize); - rb |= sllp << 5; /* AP field */ + if (b_pgshift == 12) { + if (a_pgshift > 12) { + sllp = (a_pgshift == 16) ? 5 : 4; + rb |= sllp << 5; /* AP field */ + } rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */ - break; - default: - { + } else { int aval_shift; /* * remaining bits of AVA/LP fields * Also contain the rr bits of LP */ - rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000; + rb |= (va_low << b_pgshift) & 0x7ff000; /* * Now clear not needed LP bits based on actual psize */ - rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1); + rb &= ~((1ul << a_pgshift) - 1); /* * AVAL field 58..77 - base_page_shift bits of va * we have space for 58..64 bits, Missing bits should * be zero filled. 
+1 is to take care of L bit shift */ - aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1; + aval_shift = 64 - (77 - b_pgshift) + 1; rb |= ((va_low << aval_shift) & 0xfe); rb |= 1; /* L field */ - penc = mmu_psize_defs[b_psize].penc[a_psize]; - rb |= penc << 12; /* LP field */ - break; - } + rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */ } rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ return rb; @@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) return (1UL << (hpt->order - 7)) - 1; } +/* Set bits in a dirty bitmap, which is in LE format */ +static inline void set_dirty_bits(unsigned long *map, unsigned long i, + unsigned long npages) +{ + + if (npages >= 8) + memset((char *)map + i / 8, 0xff, npages / 8); + else + for (; npages; ++i, --npages) + __set_bit_le(i, map); +} + +static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i, + unsigned long npages) +{ + if (npages >= 8) + memset((char *)map + i / 8, 0xff, npages / 8); + else + for (; npages; ++i, --npages) + set_bit_le(i, map); +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 7cea76f11c26..ab386af2904f 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -82,6 +82,16 @@ struct kvm_split_mode { u8 do_nap; u8 napped[MAX_SMT_THREADS]; struct kvmppc_vcore *vc[MAX_SUBCORES]; + /* Bits for changing lpcr on P9 */ + unsigned long lpcr_req; + unsigned long lpidr_req; + unsigned long host_lpcr; + u32 do_set; + u32 do_restore; + union { + u32 allphases; + u8 phase[4]; + } lpcr_sync; }; /* @@ -107,7 +117,8 @@ struct kvmppc_host_state { u8 hwthread_req; u8 hwthread_state; u8 host_ipi; - u8 ptid; + u8 ptid; /* thread number within subcore when split */ + u8 tid; /* thread number within whole core */ struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; void __iomem *xics_phys; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e372ed871c51..3aa5b577cd60 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -235,10 +235,7 @@ struct revmap_entry { */ #define KVMPPC_RMAP_LOCK_BIT 63 #define KVMPPC_RMAP_RC_SHIFT 32 -#define KVMPPC_RMAP_CHG_SHIFT 48 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) -#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT) -#define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul #define KVMPPC_RMAP_INDEX 0xfffffffful @@ -276,7 +273,7 @@ struct kvm_arch { int tlbie_lock; unsigned long lpcr; unsigned long vrma_slb_v; - int hpte_setup_done; + int mmu_ready; atomic_t vcpus_running; u32 online_vcores; atomic_t hpte_mod_interest; @@ -284,6 +281,7 @@ struct kvm_arch { cpumask_t cpu_in_guest; u8 radix; u8 fwnmi_enabled; + bool threads_indep; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ba5fadd6f3c9..96753f3aac6d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); extern void kvmppc_free_hpt(struct kvm_hpt_info *info); +extern void 
kvmppc_rmap_reset(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, @@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, struct iommu_group *grp); extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, struct iommu_group *grp); +extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm); +extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9aace433491a..6b958414b4e0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -642,6 +642,7 @@ int main(void) HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); + HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]); @@ -667,6 +668,8 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); + OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); + OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 59247af5fd45..235319c2574e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -73,8 +73,6 @@ struct kvm_resize_hpt { struct kvm_hpt_info hpt; }; -static void kvmppc_rmap_reset(struct kvm *kvm); - int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; @@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) /* Allocate reverse map array */ rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); if (cma) kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); else @@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) long err = -EBUSY; struct kvm_hpt_info info; - if (kvm_is_radix(kvm)) - return -EINVAL; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) { - kvm->arch.hpte_setup_done = 0; - /* order hpte_setup_done vs. vcpus_running */ + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; goto out; } } + if (kvm_is_radix(kvm)) { + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out; + } + if (kvm->arch.hpt.order == order) { /* We already have a suitable HPT */ @@ -183,6 +183,7 @@ out: void kvmppc_free_hpt(struct kvm_hpt_info *info) { vfree(info->rev); + info->rev = NULL; if (info->cma) kvm_free_hpt_cma(virt_to_page(info->virt), 1 << (info->order - PAGE_SHIFT)); @@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, { unsigned long ra_mask; - ra_mask = hpte_page_size(v, r) - 1; + ra_mask = kvmppc_actual_pgsz(v, r) - 1; return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); } @@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, int index; int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); + if (kvm_is_radix(vcpu->kvm)) + return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite); + /* Get SLB entry */ if (virtmode) { slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); @@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, mmio_update = atomic64_read(&kvm->arch.mmio_update); if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { r = vcpu->arch.pgfault_cache->rpte; - psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); + psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0], + r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; /* Translate the logical address and get the page */ - psize = hpte_page_size(hpte[0], r); + psize = kvmppc_actual_pgsz(hpte[0], r); gpa_base = r & HPTE_R_RPN & ~(psize - 1); gfn_base = gpa_base >> PAGE_SHIFT; gpa = gpa_base | (ea & (psize - 1)); @@ -650,10 +655,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* * If the HPT is being resized, don't update the HPTE, * instead let the guest retry after the resize operation is complete. - * The synchronization for hpte_setup_done test vs. set is provided + * The synchronization for mmu_ready test vs. set is provided * by the HPTE lock. */ - if (!kvm->arch.hpte_setup_done) + if (!kvm->arch.mmu_ready) goto out_unlock; if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || @@ -720,7 +725,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } -static void kvmppc_rmap_reset(struct kvm *kvm) +void kvmppc_rmap_reset(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -786,6 +791,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + struct kvm_memory_slot *memslot, unsigned long *rmapp, unsigned long gfn) { __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); @@ -808,7 +814,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Now check and modify the HPTE */ ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel); if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); @@ -817,8 +823,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); + if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; note_hpte_modification(kvm, &rev[i]); @@ -856,7 +862,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, continue; } - kvmppc_unmap_hpte(kvm, i, rmapp, gfn); + kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } @@ -1039,14 +1045,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) retry: lock_rmap(rmapp); - if (*rmapp & KVMPPC_RMAP_CHANGED) { - long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) - >> KVMPPC_RMAP_CHG_SHIFT; - *rmapp 
&= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); - npages_dirty = 1; - if (change_order > PAGE_SHIFT) - npages_dirty = 1ul << (change_order - PAGE_SHIFT); - } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); return npages_dirty; @@ -1102,7 +1100,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); } - n = hpte_page_size(v, r); + n = kvmppc_actual_pgsz(v, r); n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; if (n > npages_dirty) npages_dirty = n; @@ -1138,7 +1136,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { - unsigned long i, j; + unsigned long i; unsigned long *rmapp; preempt_disable(); @@ -1150,9 +1148,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, * since we always put huge-page HPTEs in the rmap chain * corresponding to their page base address. */ - if (npages && map) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) + set_dirty_bits(map, i, npages); ++rmapp; } preempt_enable(); @@ -1196,7 +1193,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, struct page *page = virt_to_page(va); struct kvm_memory_slot *memslot; unsigned long gfn; - unsigned long *rmap; int srcu_idx; put_page(page); @@ -1204,20 +1200,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, if (!dirty) return; - /* We need to mark this page dirty in the rmap chain */ + /* We need to mark this page dirty in the memslot dirty_bitmap, if any */ gfn = gpa >> PAGE_SHIFT; srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); - if (memslot) { - if (!kvm_is_radix(kvm)) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); - } else if (memslot->dirty_bitmap) { - mark_page_dirty(kvm, gfn); - } - } + if (memslot && memslot->dirty_bitmap) + set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap); srcu_read_unlock(&kvm->srcu, srcu_idx); } @@ -1277,7 +1265,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, guest_rpte = rev->guest_rpte; ret = -EIO; - apsize = hpte_page_size(vpte, guest_rpte); + apsize = kvmppc_actual_pgsz(vpte, guest_rpte); if (!apsize) goto out; @@ -1292,7 +1280,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmapp); - kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn); unlock_rmap(rmapp); } @@ -1465,7 +1453,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, struct kvm_resize_hpt *resize; int ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1531,7 +1519,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, struct kvm_resize_hpt *resize; long ret; - if (flags != 0) + if (flags != 0 || kvm_is_radix(kvm)) return -EINVAL; if (shift && ((shift < 18) || (shift > 46))) @@ -1543,15 +1531,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, /* This shouldn't be possible */ ret = -EIO; - if (WARN_ON(!kvm->arch.hpte_setup_done)) + if (WARN_ON(!kvm->arch.mmu_ready)) goto out_no_hpt; /* Stop VCPUs from running while we mess with the HPT */ - kvm->arch.hpte_setup_done = 0; + kvm->arch.mmu_ready = 0; smp_mb(); /* Boot all CPUs out of the guest so they re-read - * 
hpte_setup_done */ + * mmu_ready */ on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); ret = -ENXIO; @@ -1574,7 +1562,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, out: /* Let VCPUs run again */ - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; smp_mb(); out_no_hpt: resize_hpt_release(kvm, resize); @@ -1717,6 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return 0; first_pass = ctx->first_pass; flags = ctx->flags; @@ -1810,20 +1800,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, unsigned long tmp[2]; ssize_t nb; long int err, ret; - int hpte_setup; + int mmu_ready; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; + if (kvm_is_radix(kvm)) + return -EINVAL; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - hpte_setup = kvm->arch.hpte_setup_done; - if (hpte_setup) { - kvm->arch.hpte_setup_done = 0; /* temporarily */ - /* order hpte_setup_done vs. vcpus_running */ + mmu_ready = kvm->arch.mmu_ready; + if (mmu_ready) { + kvm->arch.mmu_ready = 0; /* temporarily */ + /* order mmu_ready vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.hpte_setup_done = 1; + kvm->arch.mmu_ready = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@ -1876,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, "r=%lx\n", ret, i, v, r); goto out; } - if (!hpte_setup && is_vrma_hpte(v)) { + if (!mmu_ready && is_vrma_hpte(v)) { unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; @@ -1885,7 +1877,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - hpte_setup = 1; + mmu_ready = 1; } ++i; hptp += 2; @@ -1901,9 +1893,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, } out: - /* Order HPTE updates vs. hpte_setup_done */ + /* Order HPTE updates vs. 
mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = hpte_setup; + kvm->arch.mmu_ready = mmu_ready; mutex_unlock(&kvm->lock); if (err) @@ -2012,6 +2004,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, struct kvm *kvm; __be64 *hptp; + kvm = p->kvm; + if (kvm_is_radix(kvm)) + return 0; + ret = mutex_lock_interruptible(&p->mutex); if (ret) return ret; @@ -2034,7 +2030,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, } } - kvm = p->kvm; i = p->hpt_index; hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); @@ -2109,10 +2104,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - if (kvm_is_radix(vcpu->kvm)) - mmu->xlate = kvmppc_mmu_radix_xlate; - else - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index c5d7435455f1..58618f644c56 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } -static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn, unsigned int order) -{ - unsigned long i, limit; - unsigned long *dp; - - if (!memslot->dirty_bitmap) - return; - limit = 1ul << order; - if (limit < BITS_PER_LONG) { - for (i = 0; i < limit; ++i) - mark_page_dirty(kvm, gfn + i); - return; - } - dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); - limit /= BITS_PER_LONG; - for (i = 0; i < limit; ++i) - *dp++ = ~0ul; -} - /* Called with kvm->lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) @@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - if (!shift) - mark_page_dirty(kvm, gfn); - else - mark_pages_dirty(kvm, memslot, - gfn, shift - PAGE_SHIFT); + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { + unsigned long npages = 1; + if (shift) + npages = 1ul << (shift - PAGE_SHIFT); + kvmppc_update_dirty_map(memslot, gfn, npages); } } return 0; @@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; - unsigned long n, *p; int npages; - /* - * Radix accumulates dirty bits in the first half of the - * memslot's dirty_bitmap area, for when pages are paged - * out or modified by the host directly. Pick up these - * bits and add them to the map. - */ - n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); - p = memslot->dirty_bitmap; - for (i = 0; i < n; ++i) - map[i] |= xchg(&p[i], 0); - for (i = 0; i < memslot->npages; i = j) { npages = kvm_radix_test_clear_dirty(kvm, memslot, i); @@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, * real address, if npages > 1 we can skip to i + npages. 
*/ j = i + 1; - if (npages) - for (j = i; npages; ++j, --npages) - __set_bit_le(j, map); + if (npages) { + set_dirty_bits(map, i, npages); + i = j + npages; + } } return 0; } @@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm) pgd_clear(pgd); } pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; } static void pte_ctor(void *addr) diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 3589c4e3d49b..688722acd692 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -113,7 +113,7 @@ slb_do_enter: /* Remove all SLB entries that are in use. */ - li r0, r0 + li r0, 0 slbmte r0, r0 slbia diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 40e5857c4b1c..79ea3d9269db 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -19,6 +19,7 @@ */ #include <linux/kvm_host.h> +#include <linux/kernel.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/preempt.h> @@ -98,6 +99,10 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +static bool indep_threads_mode = true; +module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -115,6 +120,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static void kvmppc_setup_partition_table(struct kvm *kvm); static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, int *ip) @@ -1734,9 +1740,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, * MMU mode (radix or HPT), unfortunately, but since we only support * HPT guests on a HPT host so far, that isn't an impediment yet. */ -static int threads_per_vcore(void) +static int threads_per_vcore(struct kvm *kvm) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm->arch.threads_indep) return 1; return threads_per_subcore; } @@ -1774,7 +1780,7 @@ static struct debugfs_timings_element { {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, }; -#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) +#define N_TIMINGS (ARRAY_SIZE(timings)) struct debugfs_timings_state { struct kvm_vcpu *vcpu; @@ -2228,11 +2234,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) kvmppc_ipi_thread(cpu); } -static void kvmppc_wait_for_nap(void) +static void kvmppc_wait_for_nap(int n_threads) { int cpu = smp_processor_id(); int i, loops; - int n_threads = threads_per_vcore(); if (n_threads <= 1) return; @@ -2319,7 +2324,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); - if (vc->num_threads < threads_per_vcore()) { + if (vc->num_threads < threads_per_vcore(vc->kvm)) { spin_lock(&lp->lock); list_add_tail(&vc->preempt_list, &lp->list); spin_unlock(&lp->lock); @@ -2357,7 +2362,7 @@ struct core_info { /* * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 - * respectively in 2-way micro-threading (split-core) mode. + * respectively in 2-way micro-threading (split-core) mode on POWER8. 
*/ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; @@ -2373,7 +2378,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { - /* Can only dynamically split if unsplit to begin with */ + /* + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core + * mode, with one thread per subcore. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return n_subcores <= 4 && n_threads == 1; + + /* On POWER8, can only dynamically split if unsplit to begin with */ if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) return false; if (n_subcores > MAX_SUBCORES) @@ -2404,6 +2416,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* POWER9 currently requires all threads to be in the same MMU mode */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && + kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) + return false; + if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) @@ -2632,6 +2649,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int target_threads; int controlled_threads; int trap; + bool is_power8; + bool hpt_on_radix; /* * Remove from the list any threads that have a signal pending @@ -2654,15 +2673,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the number of threads per subcore, except on POWER9, * where it's 1 because the threads are (mostly) independent. */ - controlled_threads = threads_per_vcore(); + controlled_threads = threads_per_vcore(vc->kvm); /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. + * On POWER9, we need to be not in independent-threads mode if + * this is a HPT guest on a radix host. */ - if ((controlled_threads > 1) && - ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { + hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + if (((controlled_threads > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || + (hpt_on_radix && vc->kvm->arch.threads_indep)) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); @@ -2699,14 +2722,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up * and return without going into the guest(s). - * If the hpte_setup_done flag has been cleared, don't go into the + * If the mmu_ready flag has been cleared, don't go into the * guest because that means a HPT resize operation is in progress. 
*/ local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || - (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) { + recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -2728,32 +2750,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cmd_bit = stat_bit = 0; split = core_info.n_subcores; sip = NULL; - if (split > 1) { - /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ - if (split == 2 && (dynamic_mt_modes & 2)) { - cmd_bit = HID0_POWER8_1TO2LPAR; - stat_bit = HID0_POWER8_2LPARMODE; - } else { - split = 4; - cmd_bit = HID0_POWER8_1TO4LPAR; - stat_bit = HID0_POWER8_4LPARMODE; - } - subcore_size = MAX_SMT_THREADS / split; + is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) + && !cpu_has_feature(CPU_FTR_ARCH_300); + + if (split > 1 || hpt_on_radix) { sip = &split_info; memset(&split_info, 0, sizeof(split_info)); - split_info.rpr = mfspr(SPRN_RPR); - split_info.pmmar = mfspr(SPRN_PMMAR); - split_info.ldbar = mfspr(SPRN_LDBAR); - split_info.subcore_size = subcore_size; for (sub = 0; sub < core_info.n_subcores; ++sub) split_info.vc[sub] = core_info.vc[sub]; + + if (is_power8) { + if (split == 2 && (dynamic_mt_modes & 2)) { + cmd_bit = HID0_POWER8_1TO2LPAR; + stat_bit = HID0_POWER8_2LPARMODE; + } else { + split = 4; + cmd_bit = HID0_POWER8_1TO4LPAR; + stat_bit = HID0_POWER8_4LPARMODE; + } + subcore_size = MAX_SMT_THREADS / split; + split_info.rpr = mfspr(SPRN_RPR); + split_info.pmmar = mfspr(SPRN_PMMAR); + split_info.ldbar = mfspr(SPRN_LDBAR); + split_info.subcore_size = subcore_size; + } else { + split_info.subcore_size = 1; + if (hpt_on_radix) { + /* Use the split_info for LPCR/LPIDR changes */ + split_info.lpcr_req = vc->lpcr; + split_info.lpidr_req = vc->kvm->arch.lpid; + split_info.host_lpcr = vc->kvm->arch.host_lpcr; + split_info.do_set = 1; + } + } + /* order writes to split_info before kvm_split_mode pointer */ smp_wmb(); } - for (thr = 0; thr < controlled_threads; ++thr) + + for (thr = 0; thr < controlled_threads; ++thr) { + paca[pcpu + thr].kvm_hstate.tid = thr; + paca[pcpu + thr].kvm_hstate.napping = 0; paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; + } - /* Initiate micro-threading (split-core) if required */ + /* Initiate micro-threading (split-core) on POWER8 if required */ if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); @@ -2772,7 +2813,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { - thr = subcore_thread_map[sub]; + thr = is_power8 ? subcore_thread_map[sub] : sub; thr0_done = false; active |= 1 << thr; pvc = core_info.vc[sub]; @@ -2799,18 +2840,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * the vcore pointer in the PACA of the secondaries. */ smp_mb(); - if (cmd_bit) - split_info.do_nap = 1; /* ask secondaries to nap when done */ /* * When doing micro-threading, poke the inactive threads as well. * This gets them to the nap instruction after kvm_do_nap, * which reduces the time taken to unsplit later. + * For POWER9 HPT guest on radix host, we need all the secondary + * threads woken up so they can do the LPCR/LPIDR change. 
*/ - if (split > 1) + if (cmd_bit || hpt_on_radix) { + split_info.do_nap = 1; /* ask secondaries to nap when done */ for (thr = 1; thr < threads_per_subcore; ++thr) if (!(active & (1 << thr))) kvmppc_ipi_thread(pcpu + thr); + } vc->vcore_state = VCORE_RUNNING; preempt_disable(); @@ -2844,10 +2887,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_EXITING; /* wait for secondary threads to finish writing their state to memory */ - kvmppc_wait_for_nap(); + kvmppc_wait_for_nap(controlled_threads); /* Return to whole-core mode if we split the core earlier */ - if (split > 1) { + if (cmd_bit) { unsigned long hid0 = mfspr(SPRN_HID0); unsigned long loops = 0; @@ -2863,8 +2906,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpu_relax(); ++loops; } - split_info.do_nap = 0; + } else if (hpt_on_radix) { + /* Wait for all threads to have seen final sync */ + for (thr = 1; thr < controlled_threads; ++thr) { + while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) { + HMT_low(); + barrier(); + } + HMT_medium(); + } } + split_info.do_nap = 0; kvmppc_set_host_core(pcpu); @@ -3073,6 +3125,25 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) +{ + int r = 0; + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->lock); + if (!kvm->arch.mmu_ready) { + if (!kvm_is_radix(kvm)) + r = kvmppc_hv_setup_htab_rma(vcpu); + if (!r) { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvmppc_setup_partition_table(kvm); + kvm->arch.mmu_ready = 1; + } + } + mutex_unlock(&kvm->lock); + return r; +} + static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded, i, r; @@ -3129,15 +3200,15 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - /* See if the HPT and VRMA are ready to go */ - if (!kvm_is_radix(vcpu->kvm) && - !vcpu->kvm->arch.hpte_setup_done) { + /* See if the MMU is ready to go */ + if (!vcpu->kvm->arch.mmu_ready) { spin_unlock(&vc->lock); - r = kvmppc_hv_setup_htab_rma(vcpu); + r = kvmhv_setup_mmu(vcpu); spin_lock(&vc->lock); if (r) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason = 0; + kvm_run->fail_entry. + hardware_entry_failure_reason = 0; vcpu->arch.ret = r; break; } @@ -3219,6 +3290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) unsigned long ebb_regs[3] = {}; /* shut up GCC */ unsigned long user_tar = 0; unsigned int user_vrsave; + struct kvm *kvm; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3256,8 +3328,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINTR; } - atomic_inc(&vcpu->kvm->arch.vcpus_running); - /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ + kvm = vcpu->kvm; + atomic_inc(&kvm->arch.vcpus_running); + /* Order vcpus_running vs. 
mmu_ready, see kvmppc_alloc_reset_hpt */ smp_mb(); flush_all_to_thread(current); @@ -3285,10 +3358,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) trace_kvm_hcall_exit(vcpu, r); kvmppc_core_prepare_to_enter(vcpu); } else if (r == RESUME_PAGE_FAULT) { - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + srcu_idx = srcu_read_lock(&kvm->srcu); r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + srcu_read_unlock(&kvm->srcu, srcu_idx); } else if (r == RESUME_PASSTHROUGH) { if (WARN_ON(xive_enabled())) r = H_SUCCESS; @@ -3308,27 +3381,26 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) mtspr(SPRN_VRSAVE, user_vrsave); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; - atomic_dec(&vcpu->kvm->arch.vcpus_running); + atomic_dec(&kvm->arch.vcpus_running); return r; } static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, - int linux_psize) + int shift, int sllp) { - struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; - - if (!def->shift) - return; - (*sps)->page_shift = def->shift; - (*sps)->slb_enc = def->sllp; - (*sps)->enc[0].page_shift = def->shift; - (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + (*sps)->page_shift = shift; + (*sps)->slb_enc = sllp; + (*sps)->enc[0].page_shift = shift; + (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift); /* - * Add 16MB MPSS support if host supports it + * Add 16MB MPSS support (may get filtered out by userspace) */ - if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { - (*sps)->enc[1].page_shift = 24; - (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + if (shift != 24) { + int penc = kvmppc_pgsize_lp_encoding(shift, 24); + if (penc != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = penc; + } } (*sps)++; } @@ -3339,13 +3411,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, struct kvm_ppc_one_seg_page_size *sps; /* - * Since we don't yet support HPT guests on a radix host, - * return an error if the host uses radix. - */ - if (radix_enabled()) - return -EINVAL; - - /* * POWER7, POWER8 and POWER9 all support 32 storage keys for data. * POWER7 doesn't support keys for instruction accesses, * POWER8 and POWER9 do. @@ -3353,16 +3418,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, info->data_keys = 32; info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 
32 : 0; - info->flags = KVM_PPC_PAGE_SIZES_REAL; - if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) - info->flags |= KVM_PPC_1T_SEGMENTS; - info->slb_size = mmu_slb_size; + /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */ + info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS; + info->slb_size = 32; /* We only support these sizes for now, and no muti-size segments */ sps = &info->sps[0]; - kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); - kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + kvmppc_add_seg_page_size(&sps, 12, 0); + kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); + kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); return 0; } @@ -3377,7 +3441,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, struct kvm_memory_slot *memslot; int i, r; unsigned long n; - unsigned long *buf; + unsigned long *buf, *p; struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -3393,8 +3457,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, goto out; /* - * Use second half of bitmap area because radix accumulates - * bits in the first half. + * Use second half of bitmap area because both HPT and radix + * accumulate bits in the first half. */ n = kvm_dirty_bitmap_bytes(memslot); buf = memslot->dirty_bitmap + n / sizeof(long); @@ -3407,6 +3471,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (r) goto out; + /* + * We accumulate dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. + */ + p = memslot->dirty_bitmap; + for (i = 0; i < n / sizeof(long); ++i) + buf[i] |= xchg(&p[i], 0); + /* Harvest dirty bits from VPA and DTL updates */ /* Note: we never modify the SLB shadow buffer areas */ kvm_for_each_vcpu(i, vcpu, kvm) { @@ -3438,15 +3512,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { - /* - * For now, if radix_enabled() then we only support radix guests, - * and in that case we don't need the rmap array. - */ - if (radix_enabled()) { - slot->arch.rmap = NULL; - return 0; - } - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3467,8 +3532,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, const struct kvm_memory_slot *new) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; /* * If we are making a new memslot, it might make @@ -3478,18 +3541,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ if (npages) atomic64_inc(&kvm->arch.mmio_update); - - if (npages && old->npages && !kvm_is_radix(kvm)) { - /* - * If modifying a memslot, reset all the rmap dirty bits. - * If this is a new memslot, we don't need to do anything - * since the rmap array starts out as all zeroes, - * i.e. no pages are dirty. - */ - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); - } } /* @@ -3545,6 +3596,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } +/* + * Set up HPT (hashed page table) and RMA (real-mode area). + * Must be called with kvm->lock held. 
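The dirty-log change above has both HPT and radix guests accumulate dirty bits in the first half of the memslot bitmap and fold them into the half returned to userspace with xchg(). Here is a self-contained userspace model of that fold, using C11 atomics in place of the kernel's xchg(); it is illustrative only, not the KVM interface.

#include <stdatomic.h>
#include <stdio.h>

#define BITMAP_LONGS 4

static _Atomic unsigned long accum[BITMAP_LONGS];  /* first half: written by host-side paths */
static unsigned long snapshot[BITMAP_LONGS];       /* second half: handed to userspace */

static void fold_dirty_bits(void)
{
        for (int i = 0; i < BITMAP_LONGS; i++)
                /* plays the role of buf[i] |= xchg(&p[i], 0) above */
                snapshot[i] |= atomic_exchange(&accum[i], 0);
}

int main(void)
{
        /* pretend the host dirtied gfn 67 (bit 3 of long 1 on a 64-bit build) */
        atomic_fetch_or(&accum[1], 1UL << 3);
        fold_dirty_bits();
        printf("snapshot[1] = %#lx, accum[1] = %#lx\n",
               snapshot[1], (unsigned long)atomic_load(&accum[1]));
        return 0;
}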
+ */ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -3556,10 +3611,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long psize, porder; int srcu_idx; - mutex_lock(&kvm->lock); - if (kvm->arch.hpte_setup_done) - goto out; /* another vcpu beat us to it */ - /* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { int order = KVM_DEFAULT_HPT_ORDER; @@ -3618,18 +3669,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* the -4 is to account for senc values starting at 0x10 */ lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - } else { - kvmppc_setup_partition_table(kvm); } - /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ + /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */ smp_wmb(); - kvm->arch.hpte_setup_done = 1; err = 0; out_srcu: srcu_read_unlock(&kvm->srcu, srcu_idx); out: - mutex_unlock(&kvm->lock); return err; up_out: @@ -3637,6 +3684,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) +{ + kvmppc_free_radix(kvm); + kvmppc_update_lpcr(kvm, LPCR_VPM1, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); + kvm->arch.radix = 0; + kvm->arch.process_table = 0; + return 0; +} + +/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ +int kvmppc_switch_mmu_to_radix(struct kvm *kvm) +{ + int err; + + err = kvmppc_init_vm_radix(kvm); + if (err) + return err; + + kvmppc_free_hpt(&kvm->arch.hpt); + kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, + LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvm->arch.radix = 1; + return 0; +} + #ifdef CONFIG_KVM_XICS /* * Allocate a per-core structure for managing state about which cores are @@ -3780,10 +3855,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) } /* - * For now, if the host uses radix, the guest must be radix. + * If the host uses radix, the guest starts out as radix. */ if (radix_enabled()) { kvm->arch.radix = 1; + kvm->arch.mmu_ready = 1; lpcr &= ~LPCR_VPM1; lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; ret = kvmppc_init_vm_radix(kvm); @@ -3803,7 +3879,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. */ - if (kvm_is_radix(kvm)) + if (radix_enabled()) kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ @@ -3815,10 +3891,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. - * On POWER9, we only need to do this for HPT guests on a radix - * host, which is not yet supported. + * On POWER9, we only need to do this if the "indep_threads_mode" + * module parameter has been set to N. 
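A small sketch of what the two switch functions above do to the LPCR, assuming kvmppc_update_lpcr(kvm, val, mask) rewrites only the bits selected by mask (consistent with how it is called in this file). The bit positions are made-up placeholders, not the architected LPCR layout; this is an illustration, not patch code.

#include <stdio.h>

/* placeholder bit values for illustration only */
#define LPCR_VPM1 0x1UL
#define LPCR_UPRT 0x2UL
#define LPCR_GTSE 0x4UL
#define LPCR_HR   0x8UL

static unsigned long update_lpcr(unsigned long lpcr, unsigned long val, unsigned long mask)
{
        return (lpcr & ~mask) | (val & mask);
}

int main(void)
{
        unsigned long mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
        unsigned long lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;   /* radix guest */

        /* switch_mmu_to_hpt: set VPM1, clear UPRT/GTSE/HR */
        lpcr = update_lpcr(lpcr, LPCR_VPM1, mask);
        printf("HPT mode   LPCR = %#lx\n", lpcr);

        /* switch_mmu_to_radix: clear VPM1, set UPRT/GTSE/HR */
        lpcr = update_lpcr(lpcr, LPCR_UPRT | LPCR_GTSE | LPCR_HR, mask);
        printf("radix mode LPCR = %#lx\n", lpcr);
        return 0;
}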
*/ - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.threads_indep = indep_threads_mode; + if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); /* @@ -3858,7 +3936,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - if (!cpu_has_feature(CPU_FTR_ARCH_300)) + if (!kvm->arch.threads_indep) kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); @@ -4193,6 +4271,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) { unsigned long lpcr; int radix; + int err; /* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) @@ -4202,12 +4281,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL; - /* We can't change a guest to/from radix yet */ - radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); - if (radix != kvm_is_radix(kvm)) - return -EINVAL; - /* GR (guest radix) bit in process_table field must match */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL; @@ -4215,15 +4290,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + /* We can change a guest to/from radix now, if the host is radix */ + if (radix && !radix_enabled()) + return -EINVAL; + mutex_lock(&kvm->lock); + if (radix != kvm_is_radix(kvm)) { + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.mmu_ready = 1; + err = -EBUSY; + goto out_unlock; + } + } + if (radix) + err = kvmppc_switch_mmu_to_radix(kvm); + else + err = kvmppc_switch_mmu_to_hpt(kvm); + if (err) + goto out_unlock; + } + kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); - mutex_unlock(&kvm->lock); + err = 0; - return 0; + out_unlock: + mutex_unlock(&kvm->lock); + return err; } static struct kvmppc_ops kvm_ops_hv = { @@ -4365,4 +4465,3 @@ module_exit(kvmppc_book3s_exit_hv); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); - diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 90644db9d38e..49a2c7825e04 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap) struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; int ptid = local_paca->kvm_hstate.ptid; struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; - int me, ee, i; + int me, ee, i, t; + int cpu0; /* Set our bit in the threads-exiting-guest map in the 0xff00 bits of vcore->entry_exit_map */ @@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap) if ((ee >> 8) == 0) kvmhv_interrupt_vcore(vc, ee); } + + /* + * On POWER9 when running a HPT guest on a radix host (sip != NULL), + * we have to interrupt inactive CPU threads to get them to + * restore the host LPCR value. 
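The kvmhv_configure_mmu() hunk above relies on a small handshake: clear mmu_ready, issue a full barrier, then check vcpus_running, while the vcpu entry path increments vcpus_running before testing mmu_ready. A minimal userspace model of that ordering using C11 atomics follows; the names mirror the kernel fields, but this is only a sketch, not KVM code.

#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

static atomic_int mmu_ready = 1;
static atomic_int vcpus_running;

static int try_reconfigure(void)
{
        atomic_store(&mmu_ready, 0);
        /* order the store of mmu_ready vs. the load of vcpus_running */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&vcpus_running)) {
                atomic_store(&mmu_ready, 1);
                return -EBUSY;
        }
        /*
         * Safe to tear down and rebuild MMU state here; mmu_ready is left
         * clear so the next vcpu run rebuilds it (kvmhv_setup_mmu above).
         */
        return 0;
}

int main(void)
{
        atomic_fetch_add(&vcpus_running, 1);
        printf("with a vcpu running: %d\n", try_reconfigure());  /* -EBUSY */
        atomic_fetch_sub(&vcpus_running, 1);
        printf("with no vcpus      : %d\n", try_reconfigure());  /* 0 */
        return 0;
}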
+ */ + if (sip->lpcr_req) { + if (cmpxchg(&sip->do_restore, 0, 1) == 0) { + vc = local_paca->kvm_hstate.kvm_vcore; + cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid; + for (t = 1; t < threads_per_core; ++t) { + if (sip->napped[t]) + kvmhv_rm_send_ipi(cpu0 + t); + } + } + } } struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; @@ -529,6 +546,8 @@ static inline bool is_rm(void) unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_xirr(vcpu); @@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; vcpu->arch.gpr[5] = get_tb(); if (xive_enabled()) { if (is_rm()) @@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipoll(vcpu, server); @@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_ipi(vcpu, server, mfrr); @@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_cppr(vcpu, cppr); @@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { + if (!kvmppc_xics_enabled(vcpu)) + return H_TOO_HARD; if (xive_enabled()) { if (is_rm()) return xive_rm_h_eoi(vcpu, xirr); @@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return xics_rm_h_eoi(vcpu, xirr); } #endif /* CONFIG_KVM_XICS */ + +void kvmppc_bad_interrupt(struct pt_regs *regs) +{ + die("Bad interrupt in KVM entry/exit code", regs, SIGABRT); + panic("Bad KVM trap"); +} + +/* + * Functions used to switch LPCR HR and UPRT bits on all threads + * when entering and exiting HPT guests on a radix host. 
+ */ + +#define PHASE_REALMODE 1 /* in real mode */ +#define PHASE_SET_LPCR 2 /* have set LPCR */ +#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */ +#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */ + +#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p)) + +static void wait_for_sync(struct kvm_split_mode *sip, int phase) +{ + int thr = local_paca->kvm_hstate.tid; + + sip->lpcr_sync.phase[thr] |= phase; + phase = ALL(phase); + while ((sip->lpcr_sync.allphases & phase) != phase) { + HMT_low(); + barrier(); + } + HMT_medium(); +} + +void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip) +{ + unsigned long rb, set; + + /* wait for every other thread to get to real mode */ + wait_for_sync(sip, PHASE_REALMODE); + + /* Set LPCR and LPIDR */ + mtspr(SPRN_LPCR, sip->lpcr_req); + mtspr(SPRN_LPID, sip->lpidr_req); + isync(); + + /* Invalidate the TLB on thread 0 */ + if (local_paca->kvm_hstate.tid == 0) { + sip->do_set = 0; + asm volatile("ptesync" : : : "memory"); + for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) { + rb = TLBIEL_INVAL_SET_LPID + + (set << TLBIEL_INVAL_SET_SHIFT); + asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : : + "r" (rb), "r" (0)); + } + asm volatile("ptesync" : : : "memory"); + } + + /* indicate that we have done so and wait for others */ + wait_for_sync(sip, PHASE_SET_LPCR); + /* order read of sip->lpcr_sync.allphases vs. sip->do_set */ + smp_rmb(); +} + +/* + * Called when a thread that has been in the guest needs + * to reload the host LPCR value - but only on POWER9 when + * running a HPT guest on a radix host. + */ +void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) +{ + /* we're out of the guest... */ + wait_for_sync(sip, PHASE_OUT_OF_GUEST); + + mtspr(SPRN_LPID, 0); + mtspr(SPRN_LPCR, sip->host_lpcr); + isync(); + + if (local_paca->kvm_hstate.tid == 0) { + sip->do_restore = 0; + smp_wmb(); /* order store of do_restore vs. 
phase */ + } + + wait_for_sync(sip, PHASE_RESET_LPCR); + smp_mb(); + local_paca->kvm_hstate.kvm_split_mode = NULL; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4efe364f1188..26c11f678fbf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -/* Update the changed page order field of an rmap entry */ -void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize) +/* Update the dirty bitmap of a memslot */ +void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned long psize) { - unsigned long order; + unsigned long npages; - if (!psize) + if (!psize || !memslot->dirty_bitmap) return; - order = ilog2(psize); - order <<= KVMPPC_RMAP_CHG_SHIFT; - if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER)) - *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order; + npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE; + gfn -= memslot->base_gfn; + set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages); +} +EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map); + +static void kvmppc_set_dirty_from_hpte(struct kvm *kvm, + unsigned long hpte_v, unsigned long hpte_gr) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long psize; + + psize = kvmppc_actual_pgsz(hpte_v, hpte_gr); + gfn = hpte_rpn(hpte_gr, psize); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, psize); } -EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change); /* Returns a pointer to the revmap entry for the page mapped by a HPTE */ static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v, - unsigned long hpte_gr) + unsigned long hpte_gr, + struct kvm_memory_slot **memslotp, + unsigned long *gfnp) { struct kvm_memory_slot *memslot; unsigned long *rmap; unsigned long gfn; - gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr)); + gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr)); memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (memslotp) + *memslotp = memslot; + if (gfnp) + *gfnp = gfn; if (!memslot) return NULL; @@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unsigned long ptel, head; unsigned long *rmap; unsigned long rcbits; + struct kvm_memory_slot *memslot; + unsigned long gfn; rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; - rmap = revmap_for_hpte(kvm, hpte_v, ptel); + rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn); if (!rmap) return; lock_rmap(rmap); @@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, } *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r)); + kvmppc_update_dirty_map(memslot, gfn, + kvmppc_actual_pgsz(hpte_v, hpte_r)); unlock_rmap(rmap); } @@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (kvm_is_radix(kvm)) return H_FUNCTION; - psize = hpte_page_size(pteh, ptel); + psize = kvmppc_actual_pgsz(pteh, ptel); if (!psize) return H_PARAMETER; writing = hpte_is_writable(ptel); @@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_R) { kvmppc_clear_ref_hpte(kvm, hpte, pte_index); - rmap = revmap_for_hpte(kvm, v, gr); + rmap = revmap_for_hpte(kvm, v, 
gr, NULL, NULL); if (rmap) { lock_rmap(rmap); *rmap |= KVMPPC_RMAP_REFERENCED; @@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, __be64 *hpte; unsigned long v, r, gr; struct revmap_entry *rev; - unsigned long *rmap; long ret = H_NOT_FOUND; if (kvm_is_radix(kvm)) @@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, r = be64_to_cpu(hpte[1]); gr |= r & (HPTE_R_R | HPTE_R_C); if (r & HPTE_R_C) { - unsigned long psize = hpte_page_size(v, r); hpte[1] = cpu_to_be64(r & ~HPTE_R_C); eieio(); - rmap = revmap_for_hpte(kvm, v, gr); - if (rmap) { - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - kvmppc_update_rmap_change(rmap, psize); - unlock_rmap(rmap); - } + kvmppc_set_dirty_from_hpte(kvm, v, gr); } } vcpu->arch.gpr[4] = gr; @@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, * Check the HPTE again, including base page size */ if ((v & valid) && (v & mask) == val && - hpte_base_page_size(v, r) == (1ul << pshift)) + kvmppc_hpte_base_page_shift(v, r) == pshift) /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 68bf0f14a962..2659844784b8 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -31,6 +31,7 @@ #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> +#include <asm/thread_info.h> /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) RFI kvmppc_call_hv_entry: +BEGIN_FTR_SECTION + /* On P9, do LPCR setting, if necessary */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 46f + lwz r4, KVM_SPLIT_DO_SET(r3) + cmpwi r4, 0 + beq 46f + bl kvmhv_p9_set_lpcr + nop +46: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_hv_entry @@ -387,6 +401,7 @@ kvm_secondary_got_guest: ld r6, HSTATE_SPLIT_MODE(r13) cmpdi r6, 0 beq 63f +BEGIN_FTR_SECTION ld r0, KVM_SPLIT_RPR(r6) mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) @@ -394,6 +409,15 @@ kvm_secondary_got_guest: ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 isync +FTR_SECTION_ELSE + /* On P9 we use the split_info for coordinating LPCR changes */ + lwz r4, KVM_SPLIT_DO_SET(r6) + cmpwi r4, 0 + beq 63f + mr r3, r6 + bl kvmhv_p9_set_lpcr + nop +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 63: /* Order load of vcpu after load of vcore */ lwsync @@ -464,6 +488,12 @@ kvm_no_guest: ld r3, HSTATE_SPLIT_MODE(r13) cmpdi r3, 0 beq kvm_no_guest + lwz r0, KVM_SPLIT_DO_SET(r3) + cmpwi r0, 0 + bne kvmhv_do_set + lwz r0, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r0, 0 + bne kvmhv_do_restore lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq kvm_no_guest @@ -476,6 +506,19 @@ kvm_no_guest: stb r0, HSTATE_HWTHREAD_STATE(r13) b kvm_no_guest +kvmhv_do_set: + /* Set LPCR, LPIDR etc. on P9 */ + HMT_MEDIUM + bl kvmhv_p9_set_lpcr + nop + b kvm_no_guest + +kvmhv_do_restore: + HMT_MEDIUM + bl kvmhv_p9_restore_lpcr + nop + b kvm_no_guest + /* * Here the primary thread is trying to return the core to * whole-core mode, so we need to nap. 
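The LPCR switching above is coordinated by the wait_for_sync()/PHASE_* rendezvous: each of the four SMT threads ORs a phase bit into its own byte of a shared word and spins until every byte shows that bit. Below is a runnable pthread model of that barrier; the union layout and the fixed thread count are assumptions made for illustration, not the kernel's kvm_split_mode structure. Build with -pthread.

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4
#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))

static union {
        volatile unsigned char phase[NTHREADS];
        volatile unsigned int allphases;
} lpcr_sync;

static void wait_for_sync(int tid, unsigned int phase)
{
        __atomic_or_fetch(&lpcr_sync.phase[tid], (unsigned char)phase, __ATOMIC_RELEASE);
        while ((__atomic_load_n(&lpcr_sync.allphases, __ATOMIC_ACQUIRE) & ALL(phase)) != ALL(phase))
                ;       /* spin, like the HMT_low()/barrier() loop above */
}

static void *thread_fn(void *arg)
{
        int tid = (int)(long)arg;

        wait_for_sync(tid, 1);  /* PHASE_REALMODE: everyone has arrived */
        wait_for_sync(tid, 2);  /* PHASE_SET_LPCR: everyone has updated its state */
        printf("thread %d passed both rendezvous points\n", tid);
        return NULL;
}

int main(void)
{
        pthread_t t[NTHREADS];

        for (long i = 0; i < NTHREADS; i++)
                pthread_create(&t[i], NULL, thread_fn, (void *)i);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The real code keeps two further phases, PHASE_OUT_OF_GUEST and PHASE_RESET_LPCR, so that no thread restores the host LPCR while another thread is still executing in the guest.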
@@ -513,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Set kvm_split_mode.napped[tid] = 1 */ ld r3, HSTATE_SPLIT_MODE(r13) li r0, 1 - lhz r4, PACAPACAINDEX(r13) - clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */ + lbz r4, HSTATE_TID(r13) addi r4, r4, KVM_SPLIT_NAPPED stbx r0, r3, r4 /* Check the do_nap flag again after setting napped[] */ @@ -1911,10 +1953,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 -16: ld r8,KVM_HOST_LPCR(r4) +16: +BEGIN_FTR_SECTION + /* On POWER9 with HPT-on-radix we need to wait for all other threads */ + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq 47f + lwz r8, KVM_SPLIT_DO_RESTORE(r3) + cmpwi r8, 0 + beq 47f + stw r12, STACK_SLOT_TRAP(r1) + bl kvmhv_p9_restore_lpcr + nop + lwz r12, STACK_SLOT_TRAP(r1) + b 48f +47: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync - +48: /* load host SLB entries */ BEGIN_MMU_FTR_SECTION b 0f @@ -3133,10 +3191,139 @@ kvmppc_restore_tm: /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. - * For now just spin, but we should do something better. + * r12 is (CR << 32) | vector + * r13 points to our PACA + * r12 is saved in HSTATE_SCRATCH0(r13) + * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE + * r9 is saved in HSTATE_SCRATCH2(r13) + * r13 is saved in HSPRG1 + * cfar is saved in HSTATE_CFAR(r13) + * ppr is saved in HSTATE_PPR(r13) */ kvmppc_bad_host_intr: + /* + * Switch to the emergency stack, but start half-way down in + * case we were already on it. + */ + mr r9, r1 + std r1, PACAR1(r13) + ld r1, PACAEMERGSP(r13) + subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE + std r9, 0(r1) + std r0, GPR0(r1) + std r9, GPR1(r1) + std r2, GPR2(r1) + SAVE_4GPRS(3, r1) + SAVE_2GPRS(7, r1) + srdi r0, r12, 32 + clrldi r12, r12, 32 + std r0, _CCR(r1) + std r12, _TRAP(r1) + andi. r0, r12, 2 + beq 1f + mfspr r3, SPRN_HSRR0 + mfspr r4, SPRN_HSRR1 + mfspr r5, SPRN_HDAR + mfspr r6, SPRN_HDSISR + b 2f +1: mfspr r3, SPRN_SRR0 + mfspr r4, SPRN_SRR1 + mfspr r5, SPRN_DAR + mfspr r6, SPRN_DSISR +2: std r3, _NIP(r1) + std r4, _MSR(r1) + std r5, _DAR(r1) + std r6, _DSISR(r1) + ld r9, HSTATE_SCRATCH2(r13) + ld r12, HSTATE_SCRATCH0(r13) + GET_SCRATCH0(r0) + SAVE_4GPRS(9, r1) + std r0, GPR13(r1) + SAVE_NVGPRS(r1) + ld r5, HSTATE_CFAR(r13) + std r5, ORIG_GPR3(r1) + mflr r3 +#ifdef CONFIG_RELOCATABLE + ld r4, HSTATE_SCRATCH1(r13) +#else + mfctr r4 +#endif + mfxer r5 + lbz r6, PACASOFTIRQEN(r13) + std r3, _LINK(r1) + std r4, _CTR(r1) + std r5, _XER(r1) + std r6, SOFTE(r1) + ld r2, PACATOC(r13) + LOAD_REG_IMMEDIATE(3, 0x7265677368657265) + std r3, STACK_FRAME_OVERHEAD-16(r1) + + /* + * On POWER9 do a minimal restore of the MMU and call C code, + * which will print a message and panic. + * XXX On POWER7 and POWER8, we just spin here since we don't + * know what the other threads are doing (and we don't want to + * coordinate with them) - but at least we now have register state + * in memory that we might be able to look at from another CPU. + */ +BEGIN_FTR_SECTION b . 
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_KVM(r9) + + li r0, 0 + mtspr SPRN_AMR, r0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + + /* Flush the ERAT on radix P9 DD1 guest exit */ +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) + +BEGIN_MMU_FTR_SECTION + b 4f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) + + slbmte r0, r0 + slbia + ptesync + ld r8, PACA_SLBSHADOWPTR(r13) + .rept SLB_NUM_BOLTED + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addi r3, r3, 8 + LDX_BE r6, r8, r3 + andis. r7, r5, SLB_ESID_V@h + beq 3f + slbmte r6, r5 +3: addi r8, r8, 16 + .endr + +4: lwz r7, KVM_HOST_LPID(r10) + mtspr SPRN_LPID, r7 + mtspr SPRN_PID, r0 + ld r8, KVM_HOST_LPCR(r10) + mtspr SPRN_LPCR, r8 + isync + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + /* + * Turn on the MMU and jump to C code + */ + bcl 20, 31, .+4 +5: mflr r3 + addi r3, r3, 9f - 5b + ld r4, PACAKMSR(r13) + mtspr SPRN_SRR0, r3 + mtspr SPRN_SRR1, r4 + rfid +9: addi r3, r1, STACK_FRAME_OVERHEAD + bl kvmppc_bad_interrupt + b 9b /* * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 69a09444d46e..d0dc8624198f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; +#ifdef CONFIG_PPC_BOOK3S_64 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + /* Flush all SLB entries */ + vcpu->arch.mmu.slbmte(vcpu, 0, 0); + vcpu->arch.mmu.slbia(vcpu); + for (i = 0; i < 64; i++) { - vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, - sregs->u.s.ppc64.slb[i].slbe); + u64 rb = sregs->u.s.ppc64.slb[i].slbe; + u64 rs = sregs->u.s.ppc64.slb[i].slbv; + + if (rb & SLB_ESID_V) + vcpu->arch.mmu.slbmte(vcpu, rs, rb); } - } else { + } else +#endif + { for (i = 0; i < 16; i++) { vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); } diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 8a4205fa774f..dae3be5ff42b 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_PROTECT: case H_BULK_REMOVE: case H_PUT_TCE: + case H_PUT_TCE_INDIRECT: + case H_STUFF_TCE: case H_CEDE: case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c6c734424c70..423b21393bc9 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, start = vma->vm_pgoff; end = start + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + vma_pages(vma); pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1abe6eb51335..6b6c53c42ac9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && !radix_enabled() && - cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); break; #endif case KVM_CAP_SYNC_MMU: diff --git a/arch/s390/include/asm/kvm_host.h 
b/arch/s390/include/asm/kvm_host.h index fd006a272024..f3a9b5a445b6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -685,11 +685,28 @@ struct kvm_s390_crypto { __u8 dea_kw; }; +#define APCB0_MASK_SIZE 1 +struct kvm_s390_apcb0 { + __u64 apm[APCB0_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB0_MASK_SIZE]; /* 0x0008 */ + __u64 adm[APCB0_MASK_SIZE]; /* 0x0010 */ + __u64 reserved18; /* 0x0018 */ +}; + +#define APCB1_MASK_SIZE 4 +struct kvm_s390_apcb1 { + __u64 apm[APCB1_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB1_MASK_SIZE]; /* 0x0020 */ + __u64 adm[APCB1_MASK_SIZE]; /* 0x0040 */ + __u64 reserved60[4]; /* 0x0060 */ +}; + struct kvm_s390_crypto_cb { - __u8 reserved00[72]; /* 0x0000 */ - __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ - __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ - __u8 reserved80[128]; /* 0x0080 */ + struct kvm_s390_apcb0 apcb0; /* 0x0000 */ + __u8 reserved20[0x0048 - 0x0020]; /* 0x0020 */ + __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ + __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ + struct kvm_s390_apcb1 apcb1; /* 0x0080 */ }; /* diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 329b2843fee2..fa557372d600 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -213,6 +213,16 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) vcpu->arch.local_int.pending_irqs; } +static inline int isc_to_irq_type(unsigned long isc) +{ + return IRQ_PEND_IO_ISC_0 + isc; +} + +static inline int irq_type_to_isc(unsigned long irq_type) +{ + return irq_type - IRQ_PEND_IO_ISC_0; +} + static unsigned long disable_iscs(struct kvm_vcpu *vcpu, unsigned long active_mask) { @@ -220,7 +230,7 @@ static unsigned long disable_iscs(struct kvm_vcpu *vcpu, for (i = 0; i <= MAX_ISC; i++) if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i))) - active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i)); + active_mask &= ~(1UL << (isc_to_irq_type(i))); return active_mask; } @@ -901,7 +911,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, fi = &vcpu->kvm->arch.float_int; spin_lock(&fi->lock); - isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0]; + isc_list = &fi->lists[irq_type_to_isc(irq_type)]; inti = list_first_entry_or_null(isc_list, struct kvm_s390_interrupt_info, list); @@ -1074,6 +1084,12 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) * in kvm_vcpu_block without having the waitqueue set (polling) */ vcpu->valid_wakeup = true; + /* + * This is mostly to document, that the read in swait_active could + * be moved before other stores, leading to subtle races. 
+ * All current users do not store or use an atomic like update + */ + smp_mb__after_atomic(); if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good @@ -1395,7 +1411,7 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm, list_del_init(&iter->list); fi->counters[FIRQ_CNTR_IO] -= 1; if (list_empty(isc_list)) - clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); + clear_bit(isc_to_irq_type(isc), &fi->pending_irqs); spin_unlock(&fi->lock); return iter; } @@ -1522,7 +1538,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); - set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); + set_bit(isc_to_irq_type(isc), &fi->pending_irqs); spin_unlock(&fi->lock); return 0; } @@ -2175,6 +2191,8 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid))) return -EFAULT; + if (!schid) + return -EINVAL; kfree(kvm_s390_get_io_int(kvm, isc_mask, schid)); /* * If userspace is conforming to the architecture, we can have at most diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4bc70afe0a10..98ad8b9e0360 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -395,6 +395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_INSTR0: case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: + case KVM_CAP_S390_AIS_MIGRATION: r = 1; break; case KVM_CAP_S390_MEM_OP: diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b18b5652e5c5..a311938b63b3 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -443,22 +443,14 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Returns: - 0 on success * - -EINVAL if the gpa is not valid guest storage - * - -ENOMEM if out of memory */ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) { struct page *page; - hva_t hva; - int rc; - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) + page = gfn_to_page(kvm, gpa_to_gfn(gpa)); + if (is_error_page(page)) return -EINVAL; - rc = get_user_pages_fast(hva, 1, 1, &page); - if (rc < 0) - return rc; - else if (rc != 1) - return -ENOMEM; *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); return 0; } @@ -466,11 +458,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. 
*/ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) { - struct page *page; - - page = virt_to_page(hpa); - set_page_dirty_lock(page); - put_page(page); + kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); /* mark the page always as dirty for migration */ mark_page_dirty(kvm, gpa_to_gfn(gpa)); } @@ -557,7 +545,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) rc = set_validity_icpt(scb_s, 0x0034U); } if (rc) @@ -574,10 +562,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 256 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0080U); - if (rc) goto unpin; + } scb_s->itdba = hpa; } @@ -592,10 +580,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * if this block gets bigger, we have to shadow it. */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x1310U); - if (rc) goto unpin; + } scb_s->gvrd = hpa; } @@ -607,11 +595,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 64 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0043U); - /* Validity 0x0044 will be checked by SIE */ - if (rc) goto unpin; + } + /* Validity 0x0044 will be checked by SIE */ scb_s->riccbd = hpa; } if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { @@ -635,10 +623,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x10b0U); - if (rc) goto unpin; + } scb_s->sdnxo = hpa | sdnxc; } return 0; @@ -663,7 +651,6 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, * * Returns: - 0 if the scb was pinned. 
* - > 0 if control has to be given to guest 2 - * - -ENOMEM if out of memory */ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, gpa_t gpa) @@ -672,14 +659,13 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, int rc; rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) { + if (rc) { rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!rc) - rc = 1; + WARN_ON_ONCE(rc); + return 1; } - if (!rc) - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; - return rc; + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return 0; } /* diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ee23a43386a2..034caa1a084e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -226,6 +226,8 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d7d856b2d89..1bfb99770c34 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1061,6 +1061,11 @@ struct kvm_x86_ops { void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); + + int (*smi_allowed)(struct kvm_vcpu *vcpu); + int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*enable_smi_window)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1426,4 +1431,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index caec8417539f..8b6780751132 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,11 +70,11 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 -#define SECONDARY_EXEC_RDRAND 0x00000800 +#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 -#define SECONDARY_EXEC_RDSEED 0x00010000 +#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d90cdc77e077..8079d141792a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) ctxt->ops->set_msr(ctxt, MSR_EFER, efer); smbase = ctxt->ops->get_smbase(ctxt); + + /* + * Give pre_leave_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. enter guest mode) before loading state from the SMM + * state-save area. 
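The put_smstate() helper added above rebases an architectural SMM state-save offset onto the buffer handed to the new pre_enter_smm() hook; judging from the subtraction, the buffer presumably covers save-area offsets from 0x7e00 upward. A tiny standalone illustration follows, with a made-up field offset and buffer size rather than a real SMRAM layout.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define put_smstate(type, buf, offset, val) \
        *(type *)((buf) + (offset) - 0x7e00) = val

int main(void)
{
        char buf[512];
        uint32_t v;

        memset(buf, 0, sizeof(buf));
        put_smstate(uint32_t, buf, 0x7f00, 0xdeadbeef);   /* hypothetical field at 0x7f00 */
        memcpy(&v, buf + (0x7f00 - 0x7e00), sizeof(v));
        printf("stored at save-area offset 0x7f00: %#x\n", (unsigned int)v);
        return 0;
}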
+ */ + if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + return X86EMUL_UNHANDLEABLE; + if (emulator_has_longmode(ctxt)) ret = rsm_load_state_64(ctxt, smbase + 0x8000); else diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36c90d631096..943acbf00c69 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1301,14 +1301,42 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void limit_periodic_timer_frequency(struct kvm_lapic *apic) +{ + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { + if (apic_lvtt_tscdeadline(apic) != (timer_mode == + APIC_LVT_TIMER_TSCDEADLINE)) { + hrtimer_cancel(&apic->lapic_timer.timer); + kvm_lapic_set_reg(apic, APIC_TMICT, 0); + apic->lapic_timer.period = 0; + apic->lapic_timer.tscdeadline = 0; + } apic->lapic_timer.timer_mode = timer_mode; - hrtimer_cancel(&apic->lapic_timer.timer); + limit_periodic_timer_frequency(apic); } } @@ -1430,6 +1458,30 @@ static void start_sw_period(struct kvm_lapic *apic) HRTIMER_MODE_ABS_PINNED); } +static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) +{ + ktime_t now, remaining; + u64 ns_remaining_old, ns_remaining_new; + + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + limit_periodic_timer_frequency(apic); + + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); + if (ktime_to_ns(remaining) < 0) + remaining = 0; + + ns_remaining_old = ktime_to_ns(remaining); + ns_remaining_new = mul_u64_u32_div(ns_remaining_old, + apic->divide_count, old_divisor); + + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, ns_remaining_new) - + nsec_to_cycles(apic->vcpu, ns_remaining_old); + apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); +} + static bool set_target_expiration(struct kvm_lapic *apic) { ktime_t now; @@ -1439,27 +1491,13 @@ static bool set_target_expiration(struct kvm_lapic *apic) apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; - if (!apic->lapic_timer.period) + if (!apic->lapic_timer.period) { + apic->lapic_timer.tscdeadline = 0; return false; - - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. 
- */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } } + limit_periodic_timer_frequency(apic); + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " @@ -1515,6 +1553,9 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return false; + if (!ktimer->tscdeadline) + return false; + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); if (r < 0) return false; @@ -1738,13 +1779,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) start_apic_timer(apic); break; - case APIC_TDCR: + case APIC_TDCR: { + uint32_t old_divisor = apic->divide_count; + if (val & 4) apic_debug("KVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); + if (apic->divide_count != old_divisor && + apic->lapic_timer.period) { + hrtimer_cancel(&apic->lapic_timer.timer); + update_target_expiration(apic, old_divisor); + restart_apic_timer(apic); + } break; - + } case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { apic_debug("KVM_WRITE:ESR not zero %x\n", val); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a119b361b8b7..e5e66e5c6640 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ module_param(dbg, bool, 0644); /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 +/* + * Return values of handle_mmio_page_fault and mmu.page_fault: + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * + * For handle_mmio_page_fault only: + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. 
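When the guest reprograms APIC_TDCR while a timer is armed, update_target_expiration() above rescales the time remaining by new_divisor/old_divisor, since each timer tick now represents a different number of bus cycles. Here is a worked example of that arithmetic, using plain 128-bit math in place of the kernel's mul_u64_u32_div(); it is a sketch only.

#include <stdint.h>
#include <stdio.h>

static uint64_t rescale_remaining_ns(uint64_t ns_remaining, uint32_t new_div, uint32_t old_div)
{
        /* ns_remaining * new_div / old_div without intermediate overflow */
        return (uint64_t)((__uint128_t)ns_remaining * new_div / old_div);
}

int main(void)
{
        /* 1.5 ms left with divide_count 2; the guest switches to divide_count 8 */
        uint64_t left = rescale_remaining_ns(1500000, 8, 2);

        printf("remaining after the divisor change: %llu ns\n",
               (unsigned long long)left);       /* 6000000 */
        return 0;
}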
+ */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE = 1, + RET_PF_INVALID = 2, +}; + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { - return __shadow_walk_next(iterator, *iterator->sptep); + __shadow_walk_next(iterator, *iterator->sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2794,13 +2808,13 @@ done: return ret; } -static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - bool emulate = false; + int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - emulate = true; + ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) - emulate = true; + ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, kvm_release_pfn_clean(pfn); - return emulate; + return ret; } static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. - * Return 1 to tell kvm to emulate it. */ if (pfn == KVM_PFN_ERR_RO_FAULT) - return 1; + return RET_PF_EMULATE; if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; + return RET_PF_RETRY; } return -EFAULT; @@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, } if (fast_page_fault(vcpu, v, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; @@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } @@ -3659,54 +3672,38 @@ exit: return reserved; } -/* - * Return values of handle_mmio_page_fault: - * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction - * directly. - * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page - * fault path update the mmio spte. - * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). 
- */ -enum { - RET_MMIO_PF_EMULATE = 1, - RET_MMIO_PF_INVALID = 2, - RET_MMIO_PF_RETRY = 0, - RET_MMIO_PF_BUG = -1 -}; - static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) - return RET_MMIO_PF_BUG; + return -EINVAL; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) - return RET_MMIO_PF_INVALID; + return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return RET_MMIO_PF_RETRY; + return RET_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); @@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect) + u64 fault_address, char *insn, int insn_len) { int r = 1; @@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, default: trace_kvm_page_fault(fault_address, error_code); - if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); @@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } if (fast_page_fault(vcpu, gpa, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; @@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4918,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, vcpu->arch.gpa_val = cr2; } + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_MMIO_PF_EMULATE) { + if (r == RET_PF_EMULATE) { emulation_type = 0; goto emulate; } - if (r == RET_MMIO_PF_RETRY) - return 1; - if (r < 0) - return r; - /* Must be RET_MMIO_PF_INVALID. 
*/ } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), - false); + if (r == RET_PF_INVALID) { + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); + WARN_ON(r == RET_PF_INVALID); + } + + if (r == RET_PF_RETRY) + return 1; if (r < 0) return r; - if (!r) - return 1; /* * Before emulating the instruction, check if the error code @@ -4993,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); - if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -5464,10 +5459,8 @@ static struct shrinker mmu_shrinker = { static void mmu_destroy_caches(void) { - if (pte_list_desc_cache) - kmem_cache_destroy(pte_list_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); + kmem_cache_destroy(pte_list_desc_cache); + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) @@ -5476,13 +5469,13 @@ int kvm_mmu_module_init(void) pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) goto nomem; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index efc857615d8e..5b408c0ad612 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect); + u64 fault_address, char *insn, int insn_len); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f18d1f8d332b..5abae72266b7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, emulate; + int top_level, ret; direct_access = gw->pte_access; @@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } clear_sp_write_flooding_count(it.sptep); - emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return emulate; + return ret; out_gpte_changed: kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } /* @@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!prefault) inject_page_fault(vcpu, &walker.fault); - return 0; + return RET_PF_RETRY; } if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { shadow_page_table_clear_flood(vcpu, addr); - return 1; + return RET_PF_EMULATE; } vcpu->arch.write_fault_to_shadow_pgtable = false; @@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (try_async_pf(vcpu, prefault, walker.gfn, 
addr, &pfn, write_fault, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; @@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e68f0b3cbf7..b71daed3cca2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag) } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - if (!vcpu) - return 0; - /* Note: * At this point, the IOMMU should have already set the pending * bit in the vAPIC backing page. So, we just need to schedule * in the vcpu. */ - if (vcpu->mode == OUTSIDE_GUEST_MODE) + if (vcpu) kvm_vcpu_wake_up(vcpu); return 0; @@ -2144,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm) return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len, !npt_enabled); + svm->vmcb->control.insn_len); +} + +static int npf_interception(struct vcpu_svm *svm) +{ + u64 fault_address = svm->vmcb->control.exit_info_2; + u64 error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); } static int db_interception(struct vcpu_svm *svm) @@ -2916,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) return true; } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb, struct page *page) { - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct page *page; - u64 vmcb_gpa; - - vmcb_gpa = svm->vmcb->save.rax; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return false; - - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - nested_svm_unmap(page); - - return false; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - - /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; 
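The hunk above splits guest #PF interception from nested page faults: pf_interception() keeps the reinjection/unprotect handling needed for shadow paging, while the new npf_interception() hands the fault straight to kvm_mmu_page_fault(), which is why kvm_handle_page_fault() could lose its need_unprotect parameter. A rough, self-contained model of that dispatch (exit codes and handler bodies are placeholders, not the real SVM values):

#include <stdio.h>

enum { EXIT_PF = 0, EXIT_NPF = 1, EXIT_MAX };

/* Guest #PF with shadow paging: may have to unprotect a tracked page first. */
static int pf_handler(void)
{
	printf("#PF: maybe unprotect the page, then run the MMU fault path\n");
	return 1;
}

/* Nested page fault: trace it and hand the fault straight to the MMU. */
static int npf_handler(void)
{
	printf("NPF: trace and run the MMU fault path directly\n");
	return 1;
}

static int (*const exit_handlers[EXIT_MAX])(void) = {
	[EXIT_PF]  = pf_handler,
	[EXIT_NPF] = npf_handler,	/* was the same handler before the split */
};

int main(void)
{
	exit_handlers[EXIT_PF]();
	exit_handlers[EXIT_NPF]();
	return 0;
}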
- if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(hsave, vmcb); - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else @@ -3072,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enable_gif(svm); mark_all_dirty(svm->vmcb); +} + +static bool nested_svm_vmrun(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return false; + + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, + nested_vmcb->control.intercept_cr >> 16, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + + /* Clear internal status */ + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); + hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + + copy_vmcb_control_area(hsave, vmcb); + + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); return true; } @@ -3173,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm) /* * If VGIF is enabled, the STGI intercept is only added to - * detect the opening of the NMI window; remove it now. + * detect the opening of the SMI/NMI window; remove it now. 
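The refactor above only moves code: nested_svm_vmrun() still snapshots L1 state into the host save area before the newly split-out enter_svm_guest_mode() loads the L2 state, so a later VMEXIT can restore L1 wholesale; the split exists so the RSM path added further down can reuse it. A minimal model of that snapshot/switch/restore pattern, with the save area reduced to three fields:

#include <assert.h>
#include <stdint.h>

struct vmcb_save { uint64_t rip, rsp, cr3; };

static struct vmcb_save hsave;		/* host (L1) save area       */
static struct vmcb_save active;		/* currently-running state   */

static void vmrun(const struct vmcb_save *nested)
{
	hsave = active;			/* copy everything now ...         */
	active = *nested;		/* ... then switch to the L2 state */
}

static void vmexit(void)
{
	active = hsave;			/* restore L1 without picking fields */
}

int main(void)
{
	struct vmcb_save l2 = { .rip = 0x2000, .rsp = 0x3000, .cr3 = 0x4000 };

	active = (struct vmcb_save){ .rip = 0x1000, .rsp = 0x1100, .cr3 = 0x1200 };
	vmrun(&l2);
	assert(active.rip == 0x2000);
	vmexit();
	assert(active.rip == 0x1000);
	return 0;
}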
*/ if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); @@ -4131,7 +4145,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = emulate_on_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -5393,6 +5407,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +static int svm_smi_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* Per APM Vol.2 15.22.2 "Response to SMI" */ + if (!gif_set(svm)) + return 0; + + if (is_guest_mode(&svm->vcpu) && + svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { + /* TODO: Might need to set exit_info_1 and exit_info_2 here */ + svm->vmcb->control.exit_code = SVM_EXIT_SMI; + svm->nested.exit_required = true; + return 0; + } + + return 1; +} + +static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (is_guest_mode(vcpu)) { + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + } + return 0; +} + +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *nested_vmcb; + struct page *page; + struct { + u64 guest; + u64 vmcb; + } svm_state_save; + int ret; + + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, + sizeof(svm_state_save)); + if (ret) + return ret; + + if (svm_state_save.guest) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); + if (nested_vmcb) + enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); + else + ret = 1; + vcpu->arch.hflags |= HF_SMM_MASK; + } + return ret; +} + +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!gif_set(svm)) { + if (vgif_enabled(svm)) + set_intercept(svm, INTERCEPT_STGI); + /* STGI will cause a vm exit */ + return 1; + } + return 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5503,6 +5599,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + + .smi_allowed = svm_smi_allowed, + .pre_enter_smm = svm_pre_enter_smm, + .pre_leave_smm = svm_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6f4f095f8f4..7c3522a989d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -486,6 +486,14 @@ struct nested_vmx { u64 nested_vmx_cr4_fixed1; u64 nested_vmx_vmcs_enum; u64 nested_vmx_vmfunc_controls; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? 
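The SMM offsets used above are consistent with the state-save layout handled later in x86.c: enter_smm() builds a 512-byte buffer covering architectural offsets 0x7e00-0x7fff and writes it at smbase + 0xfe00, so the "SVM Guest" flag stored via put_smstate() at 0x7ed8 is exactly what svm_pre_leave_smm() reads back at smbase + 0xfed8. A small standalone check of that arithmetic (buffer handling simplified; field meanings taken from the comments above):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define SMRAM_BUF_BASE	0x7e00u		/* first architectural offset in the buffer   */
#define SMRAM_GPA_OFF	0xfe00u		/* where the buffer lands, relative to smbase */

/* Mirrors the put_smstate() idea: 'offset' is an architectural state-save offset. */
static void put_u64(uint8_t *buf, unsigned int offset, uint64_t val)
{
	memcpy(buf + (offset - SMRAM_BUF_BASE), &val, sizeof(val));
}

int main(void)
{
	uint8_t buf[512] = { 0 };
	uint64_t smbase = 0x30000;		/* KVM's reset value for SMBASE */

	put_u64(buf, 0x7ed8, 1);		/* "SVM Guest" flag          */
	put_u64(buf, 0x7ee0, 0xabc000);		/* guest VMCB address (fake) */

	/* The flag written at offset 0x7ed8 ends up at guest address smbase + 0xfed8. */
	assert(smbase + SMRAM_GPA_OFF + (0x7ed8 - SMRAM_BUF_BASE) == smbase + 0xfed8);
	return 0;
}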
*/ + bool guest_mode; + } smm; }; #define POSTED_INTR_ON 0 @@ -900,16 +908,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static int alloc_identity_pagetable(struct kvm *kvm); static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -1598,18 +1603,15 @@ static inline void vpid_sync_context(int vpid) static inline void ept_sync_global(void) { - if (cpu_has_vmx_invept_global()) - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); } static inline void ept_sync_context(u64 eptp) { - if (enable_ept) { - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); - } + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); } static __always_inline void vmcs_check16(unsigned long field) @@ -2831,8 +2833,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_PML; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } - } else - vmx->nested.nested_vmx_ept_caps = 0; + } if (cpu_has_vmx_vmfunc()) { vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2841,8 +2842,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * Advertise EPTP switching unconditionally * since we emulate it */ - vmx->nested.nested_vmx_vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (enable_ept) + vmx->nested.nested_vmx_vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; } /* @@ -2856,8 +2858,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - } else - vmx->nested.nested_vmx_vpid_caps = 0; + } if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -3544,7 +3545,8 @@ static int hardware_enable(void) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (enable_ept) + ept_sync_global(); return 0; } @@ -3657,8 +3659,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED | - SECONDARY_EXEC_RDRAND | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_VMFUNC; @@ -3679,14 +3681,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, + &vmx_capability.ept, &vmx_capability.vpid); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ 
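The SECONDARY_EXEC_RDRAND/RDSEED renames above make it explicit that the bits, when set, cause the instructions to exit; the surrounding logic in vmx_compute_secondary_exec_control() then clears the exiting bit whenever the guest CPUID advertises the instruction. A compact model of that (bit values are arbitrary for the example, and the real code starts from vmcs_config and also updates the nested control MSRs):

#include <stdio.h>

#define EXEC_RDRAND_EXITING	(1u << 0)	/* illustrative bit positions */
#define EXEC_RDSEED_EXITING	(1u << 1)

static unsigned int compute_exec_control(int guest_has_rdrand, int guest_has_rdseed)
{
	/* Simplified starting point: exit on everything we could intercept. */
	unsigned int exec = EXEC_RDRAND_EXITING | EXEC_RDSEED_EXITING;

	/* If the guest is told it has the instruction, let it run natively. */
	if (guest_has_rdrand)
		exec &= ~EXEC_RDRAND_EXITING;
	if (guest_has_rdseed)
		exec &= ~EXEC_RDSEED_EXITING;
	return exec;
}

int main(void)
{
	printf("no RDRAND/RDSEED in CPUID: %#x\n", compute_exec_control(0, 0));
	printf("RDRAND only:               %#x\n", compute_exec_control(1, 0));
	return 0;
}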
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, - vmx_capability.ept, vmx_capability.vpid); + } else if (vmx_capability.ept) { + vmx_capability.ept = 0; + pr_warn_once("EPT CAP should not exist if not support " + "1-setting enable EPT VM-execution control\n"); + } + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && + vmx_capability.vpid) { + vmx_capability.vpid = 0; + pr_warn_once("VPID CAP should not exist if not support " + "1-setting enable VPID VM-execution control\n"); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -4781,18 +4794,18 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm_pfn_t identity_map_pfn; u32 tmp; - if (!enable_ept) - return 0; - /* Protect kvm->arch.ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); if (likely(kvm->arch.ept_identity_pagetable_done)) goto out2; + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; - r = alloc_identity_pagetable(kvm); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); if (r < 0) goto out2; @@ -4864,20 +4877,6 @@ out: return r; } -static int alloc_identity_pagetable(struct kvm *kvm) -{ - /* Called with kvm->slots_lock held. */ - - int r = 0; - - BUG_ON(kvm->arch.ept_identity_pagetable_done); - - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm->arch.ept_identity_map_addr, PAGE_SIZE); - - return r; -} - static int allocate_vpid(void) { int vpid; @@ -5282,13 +5281,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static bool vmx_rdrand_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; } static bool vmx_rdseed_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; } static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) @@ -5382,30 +5381,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_rdrand_supported()) { bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND; + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; if (nested) { if (rdrand_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND; + ~SECONDARY_EXEC_RDRAND_EXITING; } } if (vmx_rdseed_supported()) { bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED; + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; if (nested) { if (rdseed_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED; + ~SECONDARY_EXEC_RDSEED_EXITING; } } @@ -5426,7 +5425,7 @@ static void ept_set_mmio_spte_mask(void) /* * Sets up the vmcs for emulated real mode. 
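The new warnings above enforce a consistency rule: the capability MSR should not advertise EPT or VPID features when the corresponding enable control cannot be set, and if it does, the stale capability bits are cleared so later code cannot act on them. Roughly, the structure is as follows (names invented for the example):

#include <stdio.h>

struct caps { unsigned int ept; unsigned int vpid; };

/* Zero a reported capability when its enabling control is unavailable. */
static void sanitize_caps(struct caps *c, int can_enable_ept, int can_enable_vpid)
{
	if (!can_enable_ept && c->ept) {
		c->ept = 0;
		fprintf(stderr, "EPT caps reported without a usable enable-EPT control\n");
	}
	if (!can_enable_vpid && c->vpid) {
		c->vpid = 0;
		fprintf(stderr, "VPID caps reported without a usable enable-VPID control\n");
	}
}

int main(void)
{
	struct caps c = { .ept = 0x1, .vpid = 0x1 };

	sanitize_caps(&c, /*can_enable_ept=*/0, /*can_enable_vpid=*/1);
	printf("ept=%#x vpid=%#x\n", c.ept, c.vpid);
	return 0;
}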
*/ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 unsigned long a; @@ -5539,8 +5538,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - - return 0; } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5604,6 +5601,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + if (kvm_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, 0); setup_msrs(vmx); @@ -5912,8 +5911,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) cr2 = vmcs_readl(EXIT_QUALIFICATION); /* EPT won't cause page fault directly */ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, - true); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6747,16 +6745,14 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb()) { + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; - if (!cpu_has_vmx_unrestricted_guest()) + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) @@ -6776,8 +6772,13 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); - if (!cpu_has_vmx_ple()) + if (!cpu_has_vmx_ple()) { ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } if (!cpu_has_vmx_apicv()) { enable_apicv = 0; @@ -8415,9 +8416,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -9475,7 +9476,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; put_cpu(); } @@ -9556,11 +9556,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - err = vmx_vcpu_setup(vmx); + vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); - if (err) - goto free_vmcs; if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) @@ -9568,9 +9566,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) } if (enable_ept) { - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = - VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = init_rmode_identity_map(kvm); if (err) goto free_vmcs; @@ -11325,6 +11320,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu 
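The hardware_setup() changes above turn feature setup into a dependency chain: EPT is disabled when any prerequisite (now including global INVEPT) is missing, features that need EPT, such as unrestricted guest and A/D bits, are disabled in turn, and the PLE window knobs are zeroed as a group when PLE is unavailable. A stripped-down model of that ordering (flags and names are illustrative):

#include <stdio.h>

struct vmx_features {
	int ept, ept_ad_bits, unrestricted_guest, ple;
};

static void resolve(struct vmx_features *f,
		    int cpu_has_ept, int cpu_has_invept_global,
		    int cpu_has_ept_ad, int cpu_has_unrestricted, int cpu_has_ple)
{
	if (!cpu_has_ept || !cpu_has_invept_global)
		f->ept = 0;
	if (!cpu_has_ept_ad || !f->ept)		/* depends on the result above */
		f->ept_ad_bits = 0;
	if (!cpu_has_unrestricted || !f->ept)
		f->unrestricted_guest = 0;
	if (!cpu_has_ple)
		f->ple = 0;			/* the real code also zeroes the window knobs */
}

int main(void)
{
	struct vmx_features f = { 1, 1, 1, 1 };

	resolve(&f, /*ept*/1, /*invept_global*/0, /*ept_ad*/1, /*unrestricted*/1, /*ple*/1);
	printf("ept=%d ad=%d ug=%d ple=%d\n",
	       f.ept, f.ept_ad_bits, f.unrestricted_guest, f.ple);
	return 0;
}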
*vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) @@ -11421,8 +11418,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); if (likely(!vmx->fail)) { - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (exit_reason == -1) + sync_vmcs12(vcpu, vmcs12); + else + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) @@ -11486,7 +11486,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (enable_shadow_vmcs) + if (enable_shadow_vmcs && exit_reason != -1) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ @@ -11510,12 +11510,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); + if (exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); load_vmcs12_host_state(vcpu, vmcs12); @@ -11938,6 +11939,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +{ + /* we need a nested vmexit to enter SMM, postpone if run is pending */ + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + return 1; +} + +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); + if (vmx->nested.smm.guest_mode) + nested_vmx_vmexit(vcpu, -1, 0, 0); + + vmx->nested.smm.vmxon = vmx->nested.vmxon; + vmx->nested.vmxon = false; + return 0; +} + +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int ret; + + if (vmx->nested.smm.vmxon) { + vmx->nested.vmxon = true; + vmx->nested.smm.vmxon = false; + } + + if (vmx->nested.smm.guest_mode) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + ret = enter_vmx_non_root_mode(vcpu, false); + vcpu->arch.hflags |= HF_SMM_MASK; + if (ret) + return ret; + + vmx->nested.smm.guest_mode = false; + } + return 0; +} + +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -12063,6 +12112,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { #endif .setup_mce = vmx_setup_mce, + + .smi_allowed = vmx_smi_allowed, + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03869eb7fcd6..34c85aa2e2d1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2006,10 +2006,12 @@ static void 
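vmx_pre_enter_smm() above forces a nested VM-exit with exit_reason == -1, and the exit path now treats that value as "leave guest mode and sync vmcs12, but no architectural VM-exit happened", which is why the exit recording and the tracepoint are skipped for it. A small sketch of that sentinel-driven control flow (the helpers are stubs, not the real functions):

#include <stdio.h>

static void sync_state_only(void)   { puts("sync L2 state into vmcs12"); }
static void record_exit(int reason) { printf("record VM-exit %d in vmcs12\n", reason); }
static void trace_exit(int reason)  { printf("tracepoint: nested exit %d\n", reason); }

/* Rough shape of the patched nested_vmx_vmexit() control flow. */
static void nested_vmexit(int exit_reason)
{
	if (exit_reason == -1) {
		sync_state_only();	/* SMM entry: not an architectural exit */
	} else {
		record_exit(exit_reason);
		trace_exit(exit_reason);
	}
	puts("load L1 (host) state");
}

int main(void)
{
	nested_vmexit(30);	/* an ordinary reflected VM-exit */
	nested_vmexit(-1);	/* forced exit on SMM entry      */
	return 0;
}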
kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; + if (!msr_info->host_initiated && + (offset & 0x3) == 1 && data != 0) + return -1; vcpu->arch.mce_banks[offset] = data; break; } @@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr, data); + return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -4034,10 +4039,16 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; + mutex_lock(&kvm->lock); + r = -EINVAL; + if (kvm->created_vcpus) + goto set_identity_unlock; r = -EFAULT; if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) - goto out; + goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); +set_identity_unlock: + mutex_unlock(&kvm->lock); break; } case KVM_SET_NR_MMU_PAGES: @@ -5275,6 +5286,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); } +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5316,6 +5332,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, + .pre_leave_smm = emulator_pre_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6426,7 +6443,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } kvm_x86_ops->queue_exception(vcpu); - } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { @@ -6473,9 +6490,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; @@ -6641,13 +6655,20 @@ static void enter_smm(struct kvm_vcpu *vcpu) u32 cr0; trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); - vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else enter_smm_save_state_32(vcpu, buf); + /* + * Give pre_enter_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. leave guest mode) after we've saved the state into + * the SMM state-save area. 
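The new check in set_msr_mce() above relies on the MCE bank MSR layout: each bank occupies four consecutive MSRs (CTL, STATUS, ADDR, MISC), so offset & 3 == 1 identifies MCi_STATUS, which a guest may only clear while the host may still set it, for example when injecting an error during migration. A toy version of that validation (constants simplified; only the new STATUS rule is modeled):

#include <stdio.h>
#include <stdint.h>

#define MCE_BANK_FIRST_MSR	0x400u	/* illustrative base; 4 MSRs per bank */

/* Returns 0 on success, -1 if the write must be rejected. */
static int set_mce_bank_msr(uint32_t msr, uint64_t data, int host_initiated)
{
	uint32_t offset = msr - MCE_BANK_FIRST_MSR;

	if ((offset & 0x3) == 1 && data != 0 && !host_initiated)
		return -1;	/* guests may only clear MCi_STATUS */
	printf("bank %u reg %u <- %#llx\n", (unsigned)(offset / 4),
	       (unsigned)(offset & 3), (unsigned long long)data);
	return 0;
}

int main(void)
{
	printf("guest write: %d\n", set_mce_bank_msr(MCE_BANK_FIRST_MSR + 1, 0xdead, 0));
	printf("host  write: %d\n", set_mce_bank_msr(MCE_BANK_FIRST_MSR + 1, 0xdead, 1));
	return 0;
}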
+ */ + kvm_x86_ops->pre_enter_smm(vcpu, buf); + + vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (kvm_x86_ops->get_nmi_mask(vcpu)) @@ -6876,17 +6897,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; else { - /* Enable NMI/IRQ window open exits if needed. + /* Enable SMI/NMI/IRQ window open exits if needed. * - * SMIs have two cases: 1) they can be nested, and - * then there is nothing to do here because RSM will - * cause a vmexit anyway; 2) or the SMI can be pending - * because inject_pending_event has completed the - * injection of an IRQ or NMI from the previous vmexit, - * and then we request an immediate exit to inject the SMI. + * SMIs have three cases: + * 1) They can be nested, and then there is nothing to + * do here because RSM will cause a vmexit anyway. + * 2) There is an ISA-specific reason why SMI cannot be + * injected, and the moment when this changes can be + * intercepted. + * 3) Or the SMI can be pending because + * inject_pending_event has completed the injection + * of an IRQ or NMI from the previous vmexit, and + * then we request an immediate exit to inject the + * SMI. */ if (vcpu->arch.smi_pending && !is_smm(vcpu)) - req_immediate_exit = true; + if (!kvm_x86_ops->enable_smi_window(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -7798,18 +7825,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; + if (kvm_mpx_supported()) { + void *mpx_state_buffer; + + /* + * To avoid have the INIT path from kvm_apic_has_events() that be + * called with loaded FPU and does not let userspace fix the state. 
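The reworked comment and logic in vcpu_enter_guest() above boil down to this: with an SMI pending outside SMM, either the vendor code can arrange to be told when injection becomes possible (enable_smi_window() returns nonzero, e.g. by intercepting STGI when GIF is clear on SVM with VGIF), or nothing blocks injection and an immediate exit is requested. A condensed model of that decision (the vendor hook is stubbed):

#include <stdio.h>

/* Stub vendor hook: returns nonzero if it armed an intercept to catch the
 * moment the SMI window opens (e.g. STGI with VGIF on SVM). */
static int enable_smi_window(int gif_set)
{
	if (!gif_set) {
		puts("arm STGI intercept, wait for the window to open");
		return 1;
	}
	return 0;	/* nothing to wait for */
}

static void plan_entry(int smi_pending, int in_smm, int gif_set)
{
	int req_immediate_exit = 0;

	if (smi_pending && !in_smm)
		if (!enable_smi_window(gif_set))
			req_immediate_exit = 1;

	printf("immediate exit requested: %d\n", req_immediate_exit);
}

int main(void)
{
	plan_entry(1, 0, 0);	/* SMI pending, GIF clear: wait for STGI    */
	plan_entry(1, 0, 1);	/* SMI pending, injectable: exit right away */
	return 0;
}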
+ */ + kvm_put_guest_fpu(vcpu); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDREGS); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDCSR); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + } + if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_misc_features_enables = 0; + + vcpu->arch.xcr0 = XFEATURE_MASK_FP; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + vcpu->arch.ia32_xss = 0; + kvm_x86_ops->vcpu_reset(vcpu, init_event); } @@ -7974,16 +8023,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; - struct kvm *kvm; int r; - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -8001,7 +8045,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (r < 0) goto fail_free_pio_data; - if (irqchip_in_kernel(kvm)) { + if (irqchip_in_kernel(vcpu->kvm)) { r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; @@ -8023,10 +8067,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) fx_init(vcpu); - vcpu->arch.ia32_tsc_adjust_msr = 0x0; - vcpu->arch.pv_time_enabled = false; - - vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); |
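The vCPU reset changes above clear the MPX register state in the guest XSAVE image and IA32_XSS on every reset, and additionally return XCR0 to x87-only state on a full RESET as opposed to INIT. A schematic version of that, with the XSAVE image reduced to the two MPX components (the struct layout here is purely illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XFEATURE_MASK_FP	0x1ull	/* x87 state only */

struct guest_fpu { uint8_t bndregs[64]; uint8_t bndcsr[64]; };	/* toy XSAVE image */
struct vcpu { struct guest_fpu fpu; uint64_t xcr0, ia32_xss; };

static void vcpu_reset(struct vcpu *v, int init_event)
{
	/* MPX state is wiped on both INIT and RESET. */
	memset(v->fpu.bndregs, 0, sizeof(v->fpu.bndregs));
	memset(v->fpu.bndcsr, 0, sizeof(v->fpu.bndcsr));

	if (!init_event)		/* full RESET only */
		v->xcr0 = XFEATURE_MASK_FP;

	v->ia32_xss = 0;		/* cleared on both paths in this patch */
}

int main(void)
{
	struct vcpu v = { .xcr0 = 0x7, .ia32_xss = 0x100 };

	vcpu_reset(&v, /*init_event=*/0);
	printf("xcr0=%#llx xss=%#llx\n",
	       (unsigned long long)v.xcr0, (unsigned long long)v.ia32_xss);
	return 0;
}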