diff options
Diffstat (limited to 'arch')
79 files changed, 2221 insertions, 1434 deletions
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 69b746955fca..b9db269c6e61 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -149,6 +149,11 @@ static inline bool kvm_vcpu_trap_is_iabt(struct kvm_vcpu *vcpu) static inline u8 kvm_vcpu_trap_get_fault(struct kvm_vcpu *vcpu) { + return kvm_vcpu_get_hsr(vcpu) & HSR_FSC; +} + +static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu) +{ return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE; } diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 6dfb404f6c46..53036e21756b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #ifndef __ARM_KVM_HOST_H__ #define __ARM_KVM_HOST_H__ +#include <linux/types.h> +#include <linux/kvm_types.h> #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> @@ -40,9 +42,8 @@ #include <kvm/arm_vgic.h> -struct kvm_vcpu; u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); -int kvm_target_cpu(void); +int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_reset_coprocs(struct kvm_vcpu *vcpu); @@ -149,20 +150,17 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; -struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); -struct kvm_one_reg; int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); #define KVM_ARCH_WANT_MMU_NOTIFIER -struct kvm; int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end); @@ -172,7 +170,8 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); /* We do not have shadow page tables, hence the empty hooks */ -static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static inline int kvm_age_hva(struct kvm *kvm, unsigned long start, + unsigned long end) { return 0; } @@ -182,12 +181,16 @@ static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return 0; } +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} + struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); -struct kvm_one_reg; int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); @@ -233,4 +236,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic) int kvm_perf_init(void); int kvm_perf_teardown(void); +static inline void kvm_arch_hardware_disable(void) {} +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5cc0b0f5f72f..3f688b458143 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -78,17 +78,6 @@ static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) flush_pmd_entry(pte); } -static inline bool kvm_is_write_fault(unsigned long hsr) -{ - unsigned long hsr_ec = hsr >> HSR_EC_SHIFT; - if (hsr_ec == HSR_EC_IABT) - return false; - else if ((hsr & HSR_ISV) && !(hsr & 
HSR_WNR)) - return false; - else - return true; -} - static inline void kvm_clean_pgd(pgd_t *pgd) { clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t)); diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index e6ebdd3471e5..09ee408c1a67 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -25,6 +25,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE +#define __KVM_HAVE_READONLY_MEM #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -173,6 +174,7 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index a99e0cdf8ba2..779605122f32 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -82,12 +82,12 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void) /** * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
*/ -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void) +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) { return &kvm_arm_running_vcpu; } -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { return 0; } @@ -97,27 +97,16 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -void kvm_arch_hardware_disable(void *garbage) -{ -} - int kvm_arch_hardware_setup(void) { return 0; } -void kvm_arch_hardware_unsetup(void) -{ -} - void kvm_arch_check_processor_compat(void *rtn) { *(int *)rtn = 0; } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} /** * kvm_arch_init_vm - initializes a VM data structure @@ -172,6 +161,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm->vcpus[i] = NULL; } } + + kvm_vgic_destroy(kvm); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -188,6 +179,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_READONLY_MEM: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -253,6 +245,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); + kvm_vgic_vcpu_destroy(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } @@ -268,26 +261,15 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int ret; - /* Force users to call KVM_ARM_VCPU_INIT */ vcpu->arch.target = -1; - /* Set up VGIC */ - ret = kvm_vgic_vcpu_init(vcpu); - if (ret) - return ret; - /* Set up the timer */ kvm_timer_vcpu_init(vcpu); return 0; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = cpu; @@ -428,9 +410,9 @@ static void update_vttbr(struct kvm *kvm) /* update vttbr to be used with the new vmid */ pgd_phys = virt_to_phys(kvm->arch.pgd); + BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = 
((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK; - kvm->arch.vttbr = pgd_phys & VTTBR_BADDR_MASK; - kvm->arch.vttbr |= vmid; + kvm->arch.vttbr = pgd_phys | vmid; spin_unlock(&kvm_vmid_lock); } diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 37a0fe1bb9bb..7928dbdf2102 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -791,7 +791,7 @@ static bool is_valid_cache(u32 val) u32 level, ctype; if (val >= CSSELR_MAX) - return -ENOENT; + return false; /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ level = (val >> 1); diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 813e49258690..cc0b78769bd8 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -163,7 +163,7 @@ static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) - return ret; + return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 16e7994bf347..eea03069161b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -746,22 +746,29 @@ static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) return false; } +static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_trap_is_iabt(vcpu)) + return false; + + return kvm_vcpu_dabt_iswrite(vcpu); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - struct kvm_memory_slot *memslot, + struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret; bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; - unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; pgprot_t mem_type = PAGE_S2; - write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); + 
write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; @@ -863,7 +870,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) unsigned long fault_status; phys_addr_t fault_ipa; struct kvm_memory_slot *memslot; - bool is_iabt; + unsigned long hva; + bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; @@ -874,17 +882,22 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - fault_status = kvm_vcpu_trap_get_fault(vcpu); + fault_status = kvm_vcpu_trap_get_fault_type(vcpu); if (fault_status != FSC_FAULT && fault_status != FSC_PERM) { - kvm_err("Unsupported fault status: EC=%#x DFCS=%#lx\n", - kvm_vcpu_trap_get_class(vcpu), fault_status); + kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", + kvm_vcpu_trap_get_class(vcpu), + (unsigned long)kvm_vcpu_trap_get_fault(vcpu), + (unsigned long)kvm_vcpu_get_hsr(vcpu)); return -EFAULT; } idx = srcu_read_lock(&vcpu->kvm->srcu); gfn = fault_ipa >> PAGE_SHIFT; - if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) { + memslot = gfn_to_memslot(vcpu->kvm, gfn); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); + write_fault = kvm_is_write_fault(vcpu); + if (kvm_is_error_hva(hva) || (write_fault && !writable)) { if (is_iabt) { /* Prefetch Abort on I/O address */ kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu)); @@ -892,13 +905,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) goto out_unlock; } - if (fault_status != FSC_FAULT) { - kvm_err("Unsupported fault status on io memory: %#lx\n", - fault_status); - ret = -EFAULT; - goto out_unlock; - } - /* * The IPA is reported as [MAX:12], so we need to * complement it with the bottom 12 bits from the @@ -910,9 +916,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) goto out_unlock; } - memslot = 
gfn_to_memslot(vcpu->kvm, gfn); - - ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); if (ret == 0) ret = 1; out_unlock: diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index cc83520459ed..7fd3e27e3ccc 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -122,6 +122,17 @@ #define VTCR_EL2_T0SZ_MASK 0x3f #define VTCR_EL2_T0SZ_40B 24 +/* + * We configure the Stage-2 page tables to always restrict the IPA space to be + * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are + * not known to exist and will break with this configuration. + * + * Note that when using 4K pages, we concatenate two first level page tables + * together. + * + * The magic numbers used for VTTBR_X in this patch can be found in Tables + * D4-23 and D4-25 in ARM DDI 0487A.b. + */ #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: @@ -149,7 +160,7 @@ #endif #define VTTBR_BADDR_SHIFT (VTTBR_X - 1) -#define VTTBR_BADDR_MASK (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) +#define VTTBR_BADDR_MASK (((1LLU << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT) #define VTTBR_VMID_SHIFT (48LLU) #define VTTBR_VMID_MASK (0xffLLU << VTTBR_VMID_SHIFT) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index fdc3e21abd8d..5674a55b5518 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -174,6 +174,11 @@ static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu) static inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) { + return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC; +} + +static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) +{ return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC_TYPE; } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 
e10c45a578e3..2012c4ba8d67 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -22,6 +22,8 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include <linux/types.h> +#include <linux/kvm_types.h> #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> @@ -41,8 +43,7 @@ #define KVM_VCPU_MAX_FEATURES 3 -struct kvm_vcpu; -int kvm_target_cpu(void); +int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); @@ -164,25 +165,23 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; -struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); -struct kvm_one_reg; int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); #define KVM_ARCH_WANT_MMU_NOTIFIER -struct kvm; int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); /* We do not have shadow page tables, hence the empty hooks */ -static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static inline int kvm_age_hva(struct kvm *kvm, unsigned long start, + unsigned long end) { return 0; } @@ -192,8 +191,13 @@ static inline int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return 0; } +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} + struct kvm_vcpu *kvm_arm_get_running_vcpu(void); -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); u64 kvm_call_hyp(void *hypfn, ...); @@ -244,4 
+248,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic) } } +static inline void kvm_arch_hardware_disable(void) {} +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 8e138c7c53ac..a030d163840b 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -59,10 +59,9 @@ #define KERN_TO_HYP(kva) ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET) /* - * Align KVM with the kernel's view of physical memory. Should be - * 40bit IPA, with PGD being 8kB aligned in the 4KB page configuration. + * We currently only support a 40bit IPA. */ -#define KVM_PHYS_SHIFT PHYS_MASK_SHIFT +#define KVM_PHYS_SHIFT (40) #define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) #define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) @@ -93,19 +92,6 @@ void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) #define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) -static inline bool kvm_is_write_fault(unsigned long esr) -{ - unsigned long esr_ec = esr >> ESR_EL2_EC_SHIFT; - - if (esr_ec == ESR_EL2_EC_IABT) - return false; - - if ((esr & ESR_EL2_ISV) && !(esr & ESR_EL2_WNR)) - return false; - - return true; -} - static inline void kvm_clean_pgd(pgd_t *pgd) {} static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} static inline void kvm_clean_pte(pte_t *pte) {} diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index e633ff8cdec8..8e38878c87c6 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -37,6 +37,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_IRQ_LINE +#define __KVM_HAVE_READONLY_MEM #define KVM_REG_SIZE(id) \ (1U << (((id) & 
KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -159,6 +160,7 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) +#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 8d1ec2887a26..76794692c20b 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -174,7 +174,7 @@ static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); if (ret != 0) - return ret; + return -EFAULT; return kvm_arm_timer_set_reg(vcpu, reg->id, val); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5805e7c4a4dd..4cc3b719208e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1218,7 +1218,7 @@ static bool is_valid_cache(u32 val) u32 level, ctype; if (val >= CSSELR_MAX) - return -ENOENT; + return false; /* Bottom bit is Instruction or Data bit. Next 3 bits are level. 
*/ level = (val >> 1); diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index db95f570705f..4729752b7256 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -234,9 +234,6 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -struct kvm; -struct kvm_vcpu; - struct kvm_mmio_req { uint64_t addr; /* physical address */ uint64_t size; /* size in bytes */ @@ -595,6 +592,18 @@ void kvm_sal_emul(struct kvm_vcpu *vcpu); struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) {} +static inline void kvm_arch_hardware_unsetup(void) {} + #endif /* __ASSEMBLY__*/ #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 0729ba6acddf..ec6b9acb6bea 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -125,7 +125,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *host_iva, u64 *opt_handler) static DEFINE_SPINLOCK(vp_lock); -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { long status; long tmp_base; @@ -160,7 +160,7 @@ int kvm_arch_hardware_enable(void *garbage) return 0; } -void kvm_arch_hardware_disable(void *garbage) +void kvm_arch_hardware_disable(void) { long status; @@ -1364,10 +1364,6 @@ static void kvm_release_vm_pages(struct kvm *kvm) } } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} - void kvm_arch_destroy_vm(struct kvm *kvm) { 
kvm_iommu_unmap_guest(kvm); @@ -1376,10 +1372,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_release_vm_pages(kvm); } -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { if (cpu != vcpu->cpu) { @@ -1468,7 +1460,6 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kfree(vcpu->arch.apic); } - long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1551,21 +1542,12 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ -} - int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm) -{ -} - int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -1597,14 +1579,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } -void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, - enum kvm_mr_change change) -{ - return; -} - void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1853,10 +1827,6 @@ int kvm_arch_hardware_setup(void) return 0; } -void kvm_arch_hardware_unsetup(void) -{ -} - int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { return __apic_accept_irq(vcpu, irq->vector); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 7a3fc67bd7f9..f2c249796ea8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -96,11 +96,6 @@ #define CAUSEB_DC 27 #define CAUSEF_DC (_ULCAST_(1) << 27) -struct kvm; -struct kvm_run; -struct kvm_vcpu; -struct kvm_interrupt; - extern atomic_t kvm_mips_instance; extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn); 
extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn); @@ -767,5 +762,16 @@ extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu); extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); +static inline void kvm_arch_hardware_disable(void) {} +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index cd7114147ae7..e3b21e51ff7e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -77,24 +77,16 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { return 0; } -void kvm_arch_hardware_disable(void *garbage) -{ -} - int kvm_arch_hardware_setup(void) { return 0; } -void kvm_arch_hardware_unsetup(void) -{ -} - void kvm_arch_check_processor_compat(void *rtn) { *(int *)rtn = 0; @@ -163,10 +155,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} - static void kvm_mips_uninit_tlbs(void *arg) { /* Restore wired count */ @@ -194,21 +182,12 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, return -ENOIOCTLCMD; } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ -} - int kvm_arch_create_memslot(struct kvm *kvm, struct 
kvm_memory_slot *slot, unsigned long npages) { return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm) -{ -} - int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -254,19 +233,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, } } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} - -void kvm_arch_flush_shadow(struct kvm *kvm) -{ -} - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size, offset; @@ -998,10 +964,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ -} - int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 465dfcb82c92..5bca220bbb60 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -53,17 +53,17 @@ #define BOOKE_INTERRUPT_DEBUG 15 /* E500 */ -#define BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL 32 -#define BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST 33 -/* - * TODO: Unify 32-bit and 64-bit kernel exception handlers to use same defines - */ -#define BOOKE_INTERRUPT_SPE_UNAVAIL BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL -#define BOOKE_INTERRUPT_SPE_FP_DATA BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST -#define BOOKE_INTERRUPT_ALTIVEC_UNAVAIL BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL -#define BOOKE_INTERRUPT_ALTIVEC_ASSIST \ - BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST +#ifdef CONFIG_SPE_POSSIBLE +#define BOOKE_INTERRUPT_SPE_UNAVAIL 32 +#define BOOKE_INTERRUPT_SPE_FP_DATA 33 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34 +#endif + +#ifdef CONFIG_PPC_E500MC +#define BOOKE_INTERRUPT_ALTIVEC_UNAVAIL 32 +#define BOOKE_INTERRUPT_ALTIVEC_ASSIST 33 +#endif + #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 #define BOOKE_INTERRUPT_DOORBELL 36 #define 
BOOKE_INTERRUPT_DOORBELL_CRITICAL 37 diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index f7aa5cc395c4..3286f0d6a86c 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -23,15 +23,16 @@ #include <linux/types.h> #include <linux/kvm_host.h> -/* LPIDs we support with this build -- runtime limit may be lower */ +/* + * Number of available lpids. Only the low-order 6 bits of LPID rgister are + * implemented on e500mc+ cores. + */ #define KVMPPC_NR_LPIDS 64 #define KVMPPC_INST_EHPRIV 0x7c00021c #define EHPRIV_OC_SHIFT 11 /* "ehpriv 1" : ehpriv with OC = 1 is used for debug emulation */ #define EHPRIV_OC_DEBUG 1 -#define KVMPPC_INST_EHPRIV_DEBUG (KVMPPC_INST_EHPRIV | \ - (EHPRIV_OC_DEBUG << EHPRIV_OC_SHIFT)) static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 98d9dd50d063..047855619cc4 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -53,14 +53,18 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER -struct kvm; extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} + #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 @@ -76,10 +80,6 @@ extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); /* Physical Address Mask - allowed range of real mode RAM access */ #define KVM_PAM 
0x0fffffffffffffffULL -struct kvm; -struct kvm_run; -struct kvm_vcpu; - struct lppaca; struct slb_shadow; struct dtl_entry; @@ -144,6 +144,7 @@ enum kvm_exit_types { EMULATED_TLBWE_EXITS, EMULATED_RFI_EXITS, EMULATED_RFCI_EXITS, + EMULATED_RFDI_EXITS, DEC_EXITS, EXT_INTR_EXITS, HALT_WAKEUP, @@ -589,8 +590,6 @@ struct kvm_vcpu_arch { u32 crit_save; /* guest debug registers*/ struct debug_reg dbg_reg; - /* hardware visible debug registers when in guest state */ - struct debug_reg shadow_dbg_reg; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; @@ -612,7 +611,6 @@ struct kvm_vcpu_arch { u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ struct hrtimer dec_timer; - struct tasklet_struct tasklet; u64 dec_jiffies; u64 dec_expires; unsigned long pending_exceptions; @@ -687,4 +685,12 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE +static inline void kvm_arch_hardware_disable(void) {} +static inline void kvm_arch_hardware_unsetup(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_exit(void) {} + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index fb86a2299d8a..a6dcdb6d13c1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -38,6 +38,12 @@ #include <asm/paca.h> #endif +/* + * KVMPPC_INST_SW_BREAKPOINT is debug Instruction + * for supporting software breakpoint. 
+ */ +#define KVMPPC_INST_SW_BREAKPOINT 0x00dddd00 + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */ @@ -89,7 +95,7 @@ extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu); extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); -extern void kvmppc_decrementer_func(unsigned long data); +extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); @@ -206,6 +212,9 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq); extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq); +void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu); +void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu); + union kvmppc_one_reg { u32 wval; u64 dval; @@ -243,7 +252,7 @@ struct kvmppc_ops { int (*unmap_hva)(struct kvm *kvm, unsigned long hva); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); - int (*age_hva)(struct kvm *kvm, unsigned long hva); + int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); int (*test_age_hva)(struct kvm *kvm, unsigned long hva); void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); void (*mmu_destroy)(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 1d653308a33c..16547efa2d5a 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -319,6 +319,8 @@ * DBSR bits which have conflicting definitions on true Book E versus IBM 40x. 
*/ #ifdef CONFIG_BOOKE +#define DBSR_IDE 0x80000000 /* Imprecise Debug Event */ +#define DBSR_MRR 0x30000000 /* Most Recent Reset */ #define DBSR_IC 0x08000000 /* Instruction Completion */ #define DBSR_BT 0x04000000 /* Branch Taken */ #define DBSR_IRPT 0x02000000 /* Exception Debug Event */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index e0e49dbb145d..ab4d4732c492 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -476,6 +476,11 @@ struct kvm_get_htab_header { /* FP and vector status/control registers */ #define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80) +/* + * VSCR register is documented as a 32-bit register in the ISA, but it can + * only be accesses via a vector register. Expose VSCR as a 32-bit register + * even though the kernel represents it as a 128-bit vector. + */ #define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81) /* Virtual processor areas */ @@ -557,6 +562,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_DABRX (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb8) #define KVM_REG_PPC_WORT (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb9) #define KVM_REG_PPC_SPRG9 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba) +#define KVM_REG_PPC_DBSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index 4f1393d20079..dddba3e94260 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -91,6 +91,7 @@ _GLOBAL(setup_altivec_idle) blr +#ifdef CONFIG_PPC_E500MC _GLOBAL(__setup_cpu_e6500) mflr r6 #ifdef CONFIG_PPC64 @@ -107,14 +108,20 @@ _GLOBAL(__setup_cpu_e6500) bl __setup_cpu_e5500 mtlr r6 blr +#endif /* CONFIG_PPC_E500MC */ #ifdef CONFIG_PPC32 +#ifdef CONFIG_E200 _GLOBAL(__setup_cpu_e200) /* enable dedicated debug exception handling resources 
(Debug APU) */ mfspr r3,SPRN_HID0 ori r3,r3,HID0_DAPUEN@l mtspr SPRN_HID0,r3 b __setup_e200_ivors +#endif /* CONFIG_E200 */ + +#ifdef CONFIG_E500 +#ifndef CONFIG_PPC_E500MC _GLOBAL(__setup_cpu_e500v1) _GLOBAL(__setup_cpu_e500v2) mflr r4 @@ -129,6 +136,7 @@ _GLOBAL(__setup_cpu_e500v2) #endif mtlr r4 blr +#else /* CONFIG_PPC_E500MC */ _GLOBAL(__setup_cpu_e500mc) _GLOBAL(__setup_cpu_e5500) mflr r5 @@ -159,7 +167,9 @@ _GLOBAL(__setup_cpu_e5500) 2: mtlr r5 blr -#endif +#endif /* CONFIG_PPC_E500MC */ +#endif /* CONFIG_E500 */ +#endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC_BOOK3E_64 _GLOBAL(__restore_cpu_e6500) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 9b6dcaaec1a3..808405906336 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -1961,6 +1961,7 @@ static struct cpu_spec __initdata cpu_specs[] = { #endif /* CONFIG_PPC32 */ #ifdef CONFIG_E500 #ifdef CONFIG_PPC32 +#ifndef CONFIG_PPC_E500MC { /* e500 */ .pvr_mask = 0xffff0000, .pvr_value = 0x80200000, @@ -2000,6 +2001,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_e500, .platform = "ppc8548", }, +#else { /* e500mc */ .pvr_mask = 0xffff0000, .pvr_value = 0x80230000, @@ -2018,7 +2020,9 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_e500mc, .platform = "ppce500mc", }, +#endif /* CONFIG_PPC_E500MC */ #endif /* CONFIG_PPC32 */ +#ifdef CONFIG_PPC_E500MC { /* e5500 */ .pvr_mask = 0xffff0000, .pvr_value = 0x80240000, @@ -2062,6 +2066,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_e500mc, .platform = "ppce6500", }, +#endif /* CONFIG_PPC_E500MC */ #ifdef CONFIG_PPC32 { /* default match */ .pvr_mask = 0x00000000, diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index bb9cac6c8051..3e68d1c69718 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -635,7 +635,7 @@ 
interrupt_end_book3e: /* Altivec Unavailable Interrupt */ START_EXCEPTION(altivec_unavailable); - NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL, + NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, PROLOG_ADDITION_NONE) /* we can probably do a shorter exception entry for that one... */ EXCEPTION_COMMON(0x200) @@ -658,7 +658,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) /* AltiVec Assist */ START_EXCEPTION(altivec_assist); NORMAL_EXCEPTION_PROLOG(0x220, - BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST, + BOOKE_INTERRUPT_ALTIVEC_ASSIST, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x220) INTS_DISABLE diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index b497188a94a1..fffd1f96bb1d 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -613,34 +613,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage +/* Define SPE handlers for e200 and e500v2 */ #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG(SPE_ALTIVEC_UNAVAIL) + NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL) beq 1f bl load_up_spe b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x2010, KernelSPE) -#else - EXCEPTION(0x2020, SPE_ALTIVEC_UNAVAIL, SPEUnavailable, \ +#elif defined(CONFIG_SPE_POSSIBLE) + EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ unknown_exception, EXC_XFER_EE) -#endif /* CONFIG_SPE */ +#endif /* CONFIG_SPE_POSSIBLE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, + EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE) /* SPE Floating Point Round */ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ SPEFloatingPointRoundException, EXC_XFER_EE) -#else - EXCEPTION(0x2040, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, +#elif defined(CONFIG_SPE_POSSIBLE) + EXCEPTION(0x2040, 
SPE_FP_DATA, SPEFloatingPointData, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ unknown_exception, EXC_XFER_EE) -#endif /* CONFIG_SPE */ +#endif /* CONFIG_SPE_POSSIBLE */ + /* Performance Monitor */ EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ @@ -947,6 +949,7 @@ get_phys_addr: * Global functions */ +#ifdef CONFIG_E200 /* Adjust or setup IVORs for e200 */ _GLOBAL(__setup_e200_ivors) li r3,DebugDebug@l @@ -959,7 +962,10 @@ _GLOBAL(__setup_e200_ivors) mtspr SPRN_IVOR34,r3 sync blr +#endif +#ifdef CONFIG_E500 +#ifndef CONFIG_PPC_E500MC /* Adjust or setup IVORs for e500v1/v2 */ _GLOBAL(__setup_e500_ivors) li r3,DebugCrit@l @@ -974,7 +980,7 @@ _GLOBAL(__setup_e500_ivors) mtspr SPRN_IVOR35,r3 sync blr - +#else /* Adjust or setup IVORs for e500mc */ _GLOBAL(__setup_e500mc_ivors) li r3,DebugDebug@l @@ -1000,6 +1006,8 @@ _GLOBAL(__setup_ehv_ivors) mtspr SPRN_IVOR41,r3 sync blr +#endif /* CONFIG_PPC_E500MC */ +#endif /* CONFIG_E500 */ #ifdef CONFIG_SPE /* diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index dd03f6b299ba..b32db4b95361 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -535,174 +535,111 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -ENOTSUPP; } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { - int r; - union kvmppc_one_reg val; - int size; + int r = 0; long int i; - size = one_reg_size(reg->id); - if (size > sizeof(val)) - return -EINVAL; - - r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val); if (r == -EINVAL) { r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_DAR: - val = get_reg_val(reg->id, kvmppc_get_dar(vcpu)); + *val = get_reg_val(id, kvmppc_get_dar(vcpu)); break; case KVM_REG_PPC_DSISR: - val = get_reg_val(reg->id, 
kvmppc_get_dsisr(vcpu)); + *val = get_reg_val(id, kvmppc_get_dsisr(vcpu)); break; case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: - i = reg->id - KVM_REG_PPC_FPR0; - val = get_reg_val(reg->id, VCPU_FPR(vcpu, i)); + i = id - KVM_REG_PPC_FPR0; + *val = get_reg_val(id, VCPU_FPR(vcpu, i)); break; case KVM_REG_PPC_FPSCR: - val = get_reg_val(reg->id, vcpu->arch.fp.fpscr); - break; -#ifdef CONFIG_ALTIVEC - case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; - break; - case KVM_REG_PPC_VSCR: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + *val = get_reg_val(id, vcpu->arch.fp.fpscr); break; - case KVM_REG_PPC_VRSAVE: - val = get_reg_val(reg->id, vcpu->arch.vrsave); - break; -#endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: if (cpu_has_feature(CPU_FTR_VSX)) { - long int i = reg->id - KVM_REG_PPC_VSR0; - val.vsxval[0] = vcpu->arch.fp.fpr[i][0]; - val.vsxval[1] = vcpu->arch.fp.fpr[i][1]; + i = id - KVM_REG_PPC_VSR0; + val->vsxval[0] = vcpu->arch.fp.fpr[i][0]; + val->vsxval[1] = vcpu->arch.fp.fpr[i][1]; } else { r = -ENXIO; } break; #endif /* CONFIG_VSX */ - case KVM_REG_PPC_DEBUG_INST: { - u32 opcode = INS_TW; - r = copy_to_user((u32 __user *)(long)reg->addr, - &opcode, sizeof(u32)); + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, INS_TW); break; - } #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: if (!vcpu->arch.icp) { r = -ENXIO; break; } - val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: - val = get_reg_val(reg->id, vcpu->arch.fscr); + *val = get_reg_val(id, vcpu->arch.fscr); break; case KVM_REG_PPC_TAR: - val = get_reg_val(reg->id, vcpu->arch.tar); + *val = get_reg_val(id, vcpu->arch.tar); break; case 
KVM_REG_PPC_EBBHR: - val = get_reg_val(reg->id, vcpu->arch.ebbhr); + *val = get_reg_val(id, vcpu->arch.ebbhr); break; case KVM_REG_PPC_EBBRR: - val = get_reg_val(reg->id, vcpu->arch.ebbrr); + *val = get_reg_val(id, vcpu->arch.ebbrr); break; case KVM_REG_PPC_BESCR: - val = get_reg_val(reg->id, vcpu->arch.bescr); + *val = get_reg_val(id, vcpu->arch.bescr); break; case KVM_REG_PPC_VTB: - val = get_reg_val(reg->id, vcpu->arch.vtb); + *val = get_reg_val(id, vcpu->arch.vtb); break; case KVM_REG_PPC_IC: - val = get_reg_val(reg->id, vcpu->arch.ic); + *val = get_reg_val(id, vcpu->arch.ic); break; default: r = -EINVAL; break; } } - if (r) - return r; - - if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) - r = -EFAULT; return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { - int r; - union kvmppc_one_reg val; - int size; + int r = 0; long int i; - size = one_reg_size(reg->id); - if (size > sizeof(val)) - return -EINVAL; - - if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) - return -EFAULT; - - r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val); if (r == -EINVAL) { r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_DAR: - kvmppc_set_dar(vcpu, set_reg_val(reg->id, val)); + kvmppc_set_dar(vcpu, set_reg_val(id, *val)); break; case KVM_REG_PPC_DSISR: - kvmppc_set_dsisr(vcpu, set_reg_val(reg->id, val)); + kvmppc_set_dsisr(vcpu, set_reg_val(id, *val)); break; case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: - i = reg->id - KVM_REG_PPC_FPR0; - VCPU_FPR(vcpu, i) = set_reg_val(reg->id, val); + i = id - KVM_REG_PPC_FPR0; + VCPU_FPR(vcpu, i) = set_reg_val(id, *val); break; case KVM_REG_PPC_FPSCR: - vcpu->arch.fp.fpscr = set_reg_val(reg->id, val); - break; -#ifdef CONFIG_ALTIVEC - case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; - break; - case KVM_REG_PPC_VSCR: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); - break; - case KVM_REG_PPC_VRSAVE: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vrsave = set_reg_val(reg->id, val); + vcpu->arch.fp.fpscr = set_reg_val(id, *val); break; -#endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: if (cpu_has_feature(CPU_FTR_VSX)) { - long int i = reg->id - KVM_REG_PPC_VSR0; - vcpu->arch.fp.fpr[i][0] = val.vsxval[0]; - vcpu->arch.fp.fpr[i][1] = val.vsxval[1]; + i = id - KVM_REG_PPC_VSR0; + vcpu->arch.fp.fpr[i][0] = val->vsxval[0]; + vcpu->arch.fp.fpr[i][1] = val->vsxval[1]; } else { r = -ENXIO; } @@ -715,29 +652,29 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) break; } r = kvmppc_xics_set_icp(vcpu, - set_reg_val(reg->id, val)); + set_reg_val(id, *val)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: - vcpu->arch.fscr = set_reg_val(reg->id, val); + vcpu->arch.fscr = set_reg_val(id, *val); break; case KVM_REG_PPC_TAR: - vcpu->arch.tar = set_reg_val(reg->id, val); + vcpu->arch.tar = set_reg_val(id, *val); break; case KVM_REG_PPC_EBBHR: - vcpu->arch.ebbhr = set_reg_val(reg->id, val); + vcpu->arch.ebbhr = set_reg_val(id, *val); break; case KVM_REG_PPC_EBBRR: - vcpu->arch.ebbrr = set_reg_val(reg->id, val); + vcpu->arch.ebbrr = set_reg_val(id, *val); break; case KVM_REG_PPC_BESCR: - vcpu->arch.bescr = set_reg_val(reg->id, val); + vcpu->arch.bescr = set_reg_val(id, *val); break; case KVM_REG_PPC_VTB: - vcpu->arch.vtb = set_reg_val(reg->id, val); + vcpu->arch.vtb = set_reg_val(id, *val); break; case KVM_REG_PPC_IC: - vcpu->arch.ic = set_reg_val(reg->id, val); + vcpu->arch.ic = set_reg_val(id, *val); break; default: r = 
-EINVAL; @@ -778,13 +715,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - return -EINVAL; + vcpu->guest_debug = dbg->control; + return 0; } -void kvmppc_decrementer_func(unsigned long data) +void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - kvmppc_core_queue_dec(vcpu); kvm_vcpu_kick(vcpu); } @@ -851,9 +787,9 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm->arch.kvm_ops->age_hva(kvm, hva); + return kvm->arch.kvm_ops->age_hva(kvm, start, end); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 4bf956cf94d6..d2b3ec088b8c 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -17,7 +17,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, + unsigned long end); extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 79294c4c5015..d40770248b6a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1002,11 +1002,11 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) +int 
kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { if (!kvm->arch.using_mmu_notifiers) return 0; - return kvm_handle_hva(kvm, hva, kvm_age_rmapp); + return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 27cced9c7249..e63587d30b70 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -725,6 +725,30 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) return kvmppc_hcall_impl_hv_realmode(cmd); } +static int kvmppc_emulate_debug_inst(struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + u32 last_inst; + + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != + EMULATE_DONE) { + /* + * Fetch failed, so return to guest and + * try executing it again. + */ + return RESUME_GUEST; + } + + if (last_inst == KVMPPC_INST_SW_BREAKPOINT) { + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = kvmppc_get_pc(vcpu); + return RESUME_HOST; + } else { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } +} + static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -807,12 +831,18 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest executes an illegal instruction. - * We just generate a program interrupt to the guest, since - * we don't emulate any guest instructions at this stage. + * If the guest debug is disabled, generate a program interrupt + * to the guest. If guest debug is enabled, we need to check + * whether the instruction is a software breakpoint instruction. + * Accordingly return to Guest or Host. 
*/ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - r = RESUME_GUEST; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { + r = kvmppc_emulate_debug_inst(run, vcpu); + } else { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + } break; /* * This occurs if the guest (kernel or userspace), does something that @@ -856,7 +886,9 @@ static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, { int i, j; - kvmppc_set_pvr_hv(vcpu, sregs->pvr); + /* Only accept the same PVR as the host's, since we can't spoof it */ + if (sregs->pvr != vcpu->arch.pvr) + return -EINVAL; j = 0; for (i = 0; i < vcpu->arch.slb_nr; i++) { @@ -922,6 +954,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, long int i; switch (id) { + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); + break; case KVM_REG_PPC_HIOR: *val = get_reg_val(id, 0); break; @@ -1489,7 +1524,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, static int kvmppc_grab_hwthread(int cpu) { struct paca_struct *tpaca; - long timeout = 1000; + long timeout = 10000; tpaca = &paca[cpu]; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index b9615ba5b083..4fdc27c80f4c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -163,6 +163,12 @@ void __init kvm_cma_reserve(void) unsigned long align_size; struct memblock_region *reg; phys_addr_t selected_size = 0; + + /* + * We need CMA reservation only when we are in HV mode + */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; /* * We cannot use memblock_phys_mem_size() here, because * memblock_analyze() has not been called yet. 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f0c4db7704c3..edb2ccdbb2ba 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -355,6 +355,7 @@ kvmppc_hv_entry: * MSR = ~IR|DR * R13 = PACA * R1 = host R1 + * R2 = TOC * all other volatile GPRS = free */ mflr r0 @@ -503,7 +504,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) toc_tlbie_lock: .tc native_tlbie_lock[TC],native_tlbie_lock .previous - ld r3,toc_tlbie_lock@toc(2) + ld r3,toc_tlbie_lock@toc(r2) #ifdef __BIG_ENDIAN__ lwz r8,PACA_LOCK_TOKEN(r13) #else diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index faffb27badd9..cf2eb16846d1 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -295,7 +295,8 @@ static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, return 0; } -static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, + unsigned long end) { /* XXX could be more clever ;) */ return 0; @@ -1319,6 +1320,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, int r = 0; switch (id) { + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); + break; case KVM_REG_PPC_HIOR: *val = get_reg_val(id, to_book3s(vcpu)->hior); break; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b4c89fa6f109..9b55dec2d6cc 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -124,6 +124,40 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) } #endif +/* + * Load up guest vcpu FP state if it's needed. + * It also set the MSR_FP in thread so that host know + * we're holding FPU, and then host can help to save + * guest vcpu FP state if other threads require to use FPU. + * This simulates an FP unavailable fault. + * + * It requires to be called with preemption disabled. 
+ */ +static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (!(current->thread.regs->msr & MSR_FP)) { + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + current->thread.fp_save_area = &vcpu->arch.fp; + current->thread.regs->msr |= MSR_FP; + } +#endif +} + +/* + * Save guest vcpu FP state into thread. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + current->thread.fp_save_area = NULL; +#endif +} + static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) { #if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) @@ -134,6 +168,40 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +/* + * Simulate AltiVec unavailable fault to load guest state + * from thread to AltiVec unit. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_load_guest_altivec(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) { + if (!(current->thread.regs->msr & MSR_VEC)) { + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + current->thread.vr_save_area = &vcpu->arch.vr; + current->thread.regs->msr |= MSR_VEC; + } + } +#endif +} + +/* + * Save guest vcpu AltiVec state into thread. + * It requires to be called with preemption disabled. 
+ */ +static inline void kvmppc_save_guest_altivec(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + current->thread.vr_save_area = NULL; + } +#endif +} + static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) { /* Synchronize guest's desire to get debug interrupts into shadow MSR */ @@ -267,6 +335,16 @@ static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); } +void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DEBUG); +} + +void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DEBUG, &vcpu->arch.pending_exceptions); +} + static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) { kvmppc_set_srr0(vcpu, srr0); @@ -341,9 +419,15 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_ITLB_MISS: case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_FP_UNAVAIL: +#ifdef CONFIG_SPE_POSSIBLE case BOOKE_IRQPRIO_SPE_UNAVAIL: case BOOKE_IRQPRIO_SPE_FP_DATA: case BOOKE_IRQPRIO_SPE_FP_ROUND: +#endif +#ifdef CONFIG_ALTIVEC + case BOOKE_IRQPRIO_ALTIVEC_UNAVAIL: + case BOOKE_IRQPRIO_ALTIVEC_ASSIST: +#endif case BOOKE_IRQPRIO_AP_UNAVAIL: allowed = 1; msr_mask = MSR_CE | MSR_ME | MSR_DE; @@ -377,7 +461,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, allowed = vcpu->arch.shared->msr & MSR_DE; allowed = allowed && !crit; msr_mask = MSR_ME; - int_class = INT_CLASS_CRIT; + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) + int_class = INT_CLASS_DBG; + else + int_class = INT_CLASS_CRIT; + break; } @@ -654,20 +742,27 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* * Since we can't trap on MSR_FP in GS-mode, we consider the guest - * as always using the FPU. 
Kernel usage of FP (via - * enable_kernel_fp()) in this thread must not occur while - * vcpu->fpu_active is set. + * as always using the FPU. */ - vcpu->fpu_active = 1; - kvmppc_load_guest_fp(vcpu); #endif +#ifdef CONFIG_ALTIVEC + /* Save userspace AltiVec state in stack */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + enable_kernel_altivec(); + /* + * Since we can't trap on MSR_VEC in GS-mode, we consider the guest + * as always using the AltiVec. + */ + kvmppc_load_guest_altivec(vcpu); +#endif + /* Switch to guest debug context */ - debug = vcpu->arch.shadow_dbg_reg; + debug = vcpu->arch.dbg_reg; switch_booke_debug_regs(&debug); debug = current->thread.debug; - current->thread.debug = vcpu->arch.shadow_dbg_reg; + current->thread.debug = vcpu->arch.dbg_reg; vcpu->arch.pgdir = current->mm->pgd; kvmppc_fix_ee_before_entry(); @@ -683,8 +778,10 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); +#endif - vcpu->fpu_active = 0; +#ifdef CONFIG_ALTIVEC + kvmppc_save_guest_altivec(vcpu); #endif out: @@ -728,9 +825,36 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) { - struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); + struct debug_reg *dbg_reg = &(vcpu->arch.dbg_reg); u32 dbsr = vcpu->arch.dbsr; + if (vcpu->guest_debug == 0) { + /* + * Debug resources belong to Guest. + * Imprecise debug event is not injected + */ + if (dbsr & DBSR_IDE) { + dbsr &= ~DBSR_IDE; + if (!dbsr) + return RESUME_GUEST; + } + + if (dbsr && (vcpu->arch.shared->msr & MSR_DE) && + (vcpu->arch.dbg_reg.dbcr0 & DBCR0_IDM)) + kvmppc_core_queue_debug(vcpu); + + /* Inject a program interrupt if trap debug is not allowed */ + if ((dbsr & DBSR_TIE) && !(vcpu->arch.shared->msr & MSR_DE)) + kvmppc_core_queue_program(vcpu, ESR_PTR); + + return RESUME_GUEST; + } + + /* + * Debug resource owned by userspace. 
+ * Clear guest dbsr (vcpu->arch.dbsr) + */ + vcpu->arch.dbsr = 0; run->debug.arch.status = 0; run->debug.arch.address = vcpu->arch.pc; @@ -868,7 +992,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_DATA_STORAGE: case BOOKE_INTERRUPT_DTLB_MISS: case BOOKE_INTERRUPT_HV_PRIV: - emulated = kvmppc_get_last_inst(vcpu, false, &last_inst); + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + break; + case BOOKE_INTERRUPT_PROGRAM: + /* SW breakpoints arrive as illegal instructions on HV */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); break; default: break; @@ -947,6 +1076,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOKE_INTERRUPT_PROGRAM: + if ((vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) && + (last_inst == KVMPPC_INST_SW_BREAKPOINT)) { + /* + * We are here because of an SW breakpoint instr, + * so let's return to host to handle. + */ + r = kvmppc_handle_debug(run, vcpu); + run->exit_reason = KVM_EXIT_DEBUG; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + break; + } + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { /* * Program traps generated by user-level software must @@ -991,7 +1132,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); r = RESUME_GUEST; break; -#else +#elif defined(CONFIG_SPE_POSSIBLE) case BOOKE_INTERRUPT_SPE_UNAVAIL: /* * Guest wants SPE, but host kernel doesn't support it. Send @@ -1012,6 +1153,22 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->hw.hardware_exit_reason = exit_nr; r = RESUME_HOST; break; +#endif /* CONFIG_SPE_POSSIBLE */ + +/* + * On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC, + * see kvmppc_core_check_processor_compat(). 
+ */ +#ifdef CONFIG_ALTIVEC + case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_ALTIVEC_ASSIST: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_ASSIST); + r = RESUME_GUEST; + break; #endif case BOOKE_INTERRUPT_DATA_STORAGE: @@ -1188,6 +1345,8 @@ out: else { /* interrupts now hard-disabled */ kvmppc_fix_ee_before_entry(); + kvmppc_load_guest_fp(vcpu); + kvmppc_load_guest_altivec(vcpu); } } @@ -1243,6 +1402,11 @@ int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, (unsigned long)vcpu); + /* + * Clear DBSR.MRR to avoid guest debug interrupt as + * this is of host interest + */ + mtspr(SPRN_DBSR, DBSR_MRR); return 0; } @@ -1457,144 +1621,125 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; - union kvmppc_one_reg val; - int size; - - size = one_reg_size(reg->id); - if (size > sizeof(val)) - return -EINVAL; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_IAC1: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac1); break; case KVM_REG_PPC_IAC2: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac2); break; #if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac3); break; case KVM_REG_PPC_IAC4: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac4); break; #endif case KVM_REG_PPC_DAC1: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); + *val = get_reg_val(id, vcpu->arch.dbg_reg.dac1); break; case 
KVM_REG_PPC_DAC2: - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); + *val = get_reg_val(id, vcpu->arch.dbg_reg.dac2); break; case KVM_REG_PPC_EPR: { u32 epr = kvmppc_get_epr(vcpu); - val = get_reg_val(reg->id, epr); + *val = get_reg_val(id, epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: - val = get_reg_val(reg->id, vcpu->arch.epcr); + *val = get_reg_val(id, vcpu->arch.epcr); break; #endif case KVM_REG_PPC_TCR: - val = get_reg_val(reg->id, vcpu->arch.tcr); + *val = get_reg_val(id, vcpu->arch.tcr); break; case KVM_REG_PPC_TSR: - val = get_reg_val(reg->id, vcpu->arch.tsr); + *val = get_reg_val(id, vcpu->arch.tsr); break; case KVM_REG_PPC_DEBUG_INST: - val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); break; case KVM_REG_PPC_VRSAVE: - val = get_reg_val(reg->id, vcpu->arch.vrsave); + *val = get_reg_val(id, vcpu->arch.vrsave); break; default: - r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val); break; } - if (r) - return r; - - if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) - r = -EFAULT; - return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; - union kvmppc_one_reg val; - int size; - size = one_reg_size(reg->id); - if (size > sizeof(val)) - return -EINVAL; - - if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) - return -EFAULT; - - switch (reg->id) { + switch (id) { case KVM_REG_PPC_IAC1: - vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.iac1 = set_reg_val(id, *val); break; case KVM_REG_PPC_IAC2: - vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.iac2 = set_reg_val(id, *val); break; #if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: - vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); + 
vcpu->arch.dbg_reg.iac3 = set_reg_val(id, *val); break; case KVM_REG_PPC_IAC4: - vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.iac4 = set_reg_val(id, *val); break; #endif case KVM_REG_PPC_DAC1: - vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.dac1 = set_reg_val(id, *val); break; case KVM_REG_PPC_DAC2: - vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.dac2 = set_reg_val(id, *val); break; case KVM_REG_PPC_EPR: { - u32 new_epr = set_reg_val(reg->id, val); + u32 new_epr = set_reg_val(id, *val); kvmppc_set_epr(vcpu, new_epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: { - u32 new_epcr = set_reg_val(reg->id, val); + u32 new_epcr = set_reg_val(id, *val); kvmppc_set_epcr(vcpu, new_epcr); break; } #endif case KVM_REG_PPC_OR_TSR: { - u32 tsr_bits = set_reg_val(reg->id, val); + u32 tsr_bits = set_reg_val(id, *val); kvmppc_set_tsr_bits(vcpu, tsr_bits); break; } case KVM_REG_PPC_CLEAR_TSR: { - u32 tsr_bits = set_reg_val(reg->id, val); + u32 tsr_bits = set_reg_val(id, *val); kvmppc_clr_tsr_bits(vcpu, tsr_bits); break; } case KVM_REG_PPC_TSR: { - u32 tsr = set_reg_val(reg->id, val); + u32 tsr = set_reg_val(id, *val); kvmppc_set_tsr(vcpu, tsr); break; } case KVM_REG_PPC_TCR: { - u32 tcr = set_reg_val(reg->id, val); + u32 tcr = set_reg_val(id, *val); kvmppc_set_tcr(vcpu, tcr); break; } case KVM_REG_PPC_VRSAVE: - vcpu->arch.vrsave = set_reg_val(reg->id, val); + vcpu->arch.vrsave = set_reg_val(id, *val); break; default: - r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val); break; } @@ -1694,10 +1839,8 @@ void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) update_timer_ints(vcpu); } -void kvmppc_decrementer_func(unsigned long data) +void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - if (vcpu->arch.tcr & TCR_ARE) { vcpu->arch.dec = vcpu->arch.decar; 
kvmppc_emulate_dec(vcpu); @@ -1842,7 +1985,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int n, b = 0, w = 0; if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { - vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + vcpu->arch.dbg_reg.dbcr0 = 0; vcpu->guest_debug = 0; kvm_guest_protect_msr(vcpu, MSR_DE, false); return 0; @@ -1850,15 +1993,13 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_guest_protect_msr(vcpu, MSR_DE, true); vcpu->guest_debug = dbg->control; - vcpu->arch.shadow_dbg_reg.dbcr0 = 0; - /* Set DBCR0_EDM in guest visible DBCR0 register. */ - vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + vcpu->arch.dbg_reg.dbcr0 = 0; if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + vcpu->arch.dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; /* Code below handles only HW breakpoints */ - dbg_reg = &(vcpu->arch.shadow_dbg_reg); + dbg_reg = &(vcpu->arch.dbg_reg); #ifdef CONFIG_KVM_BOOKE_HV /* diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index f753543c56fa..22ba08ea68e9 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -32,9 +32,15 @@ #define BOOKE_IRQPRIO_ALIGNMENT 2 #define BOOKE_IRQPRIO_PROGRAM 3 #define BOOKE_IRQPRIO_FP_UNAVAIL 4 +#ifdef CONFIG_SPE_POSSIBLE #define BOOKE_IRQPRIO_SPE_UNAVAIL 5 #define BOOKE_IRQPRIO_SPE_FP_DATA 6 #define BOOKE_IRQPRIO_SPE_FP_ROUND 7 +#endif +#ifdef CONFIG_PPC_E500MC +#define BOOKE_IRQPRIO_ALTIVEC_UNAVAIL 5 +#define BOOKE_IRQPRIO_ALTIVEC_ASSIST 6 +#endif #define BOOKE_IRQPRIO_SYSCALL 8 #define BOOKE_IRQPRIO_AP_UNAVAIL 9 #define BOOKE_IRQPRIO_DTLB_MISS 10 @@ -116,40 +122,6 @@ extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); -/* - * Load up guest vcpu FP state if it's needed. 
- * It also set the MSR_FP in thread so that host know - * we're holding FPU, and then host can help to save - * guest vcpu FP state if other threads require to use FPU. - * This simulates an FP unavailable fault. - * - * It requires to be called with preemption disabled. - */ -static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_PPC_FPU - if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) { - enable_kernel_fp(); - load_fp_state(&vcpu->arch.fp); - current->thread.fp_save_area = &vcpu->arch.fp; - current->thread.regs->msr |= MSR_FP; - } -#endif -} - -/* - * Save guest vcpu FP state into thread. - * It requires to be called with preemption disabled. - */ -static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_PPC_FPU - if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP)) - giveup_fpu(current); - current->thread.fp_save_area = NULL; -#endif -} - static inline void kvmppc_clear_dbsr(void) { mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 28c158881d23..a82f64502de1 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -25,6 +25,7 @@ #define OP_19_XOP_RFI 50 #define OP_19_XOP_RFCI 51 +#define OP_19_XOP_RFDI 39 #define OP_31_XOP_MFMSR 83 #define OP_31_XOP_WRTEE 131 @@ -37,6 +38,12 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); } +static void kvmppc_emul_rfdi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.dsrr0; + kvmppc_set_msr(vcpu, vcpu->arch.dsrr1); +} + static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) { vcpu->arch.pc = vcpu->arch.csrr0; @@ -65,6 +72,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, *advance = 0; break; + case OP_19_XOP_RFDI: + kvmppc_emul_rfdi(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFDI_EXITS); + *advance = 0; + break; + default: emulated = EMULATE_FAIL; break; @@ -118,6 
+131,7 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; + bool debug_inst = false; switch (sprn) { case SPRN_DEAR: @@ -132,14 +146,128 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_CSRR1: vcpu->arch.csrr1 = spr_val; break; + case SPRN_DSRR0: + vcpu->arch.dsrr0 = spr_val; + break; + case SPRN_DSRR1: + vcpu->arch.dsrr1 = spr_val; + break; + case SPRN_IAC1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac1 = spr_val; + break; + case SPRN_IAC2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac2 = spr_val; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case SPRN_IAC3: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac3 = spr_val; + break; + case SPRN_IAC4: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac4 = spr_val; + break; +#endif + case SPRN_DAC1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dac1 = spr_val; + break; + case SPRN_DAC2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dac2 = spr_val; + break; case SPRN_DBCR0: + /* + * If userspace is debugging guest then guest + * can not access debug registers. 
+ */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + spr_val &= (DBCR0_IDM | DBCR0_IC | DBCR0_BT | DBCR0_TIE | + DBCR0_IAC1 | DBCR0_IAC2 | DBCR0_IAC3 | DBCR0_IAC4 | + DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W); + vcpu->arch.dbg_reg.dbcr0 = spr_val; break; case SPRN_DBCR1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; vcpu->arch.dbg_reg.dbcr1 = spr_val; break; + case SPRN_DBCR2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dbcr2 = spr_val; + break; case SPRN_DBSR: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + vcpu->arch.dbsr &= ~spr_val; + if (!(vcpu->arch.dbsr & ~DBSR_IDE)) + kvmppc_core_dequeue_debug(vcpu); break; case SPRN_TSR: kvmppc_clr_tsr_bits(vcpu, spr_val); @@ -252,6 +380,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) emulated = EMULATE_FAIL; } + if (debug_inst) { + current->thread.debug = vcpu->arch.dbg_reg; + switch_booke_debug_regs(&vcpu->arch.dbg_reg); + } return emulated; } @@ -278,12 +410,43 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_CSRR1: *spr_val = vcpu->arch.csrr1; break; + case SPRN_DSRR0: + *spr_val = vcpu->arch.dsrr0; + break; + case SPRN_DSRR1: + *spr_val = vcpu->arch.dsrr1; + break; + case SPRN_IAC1: + *spr_val = vcpu->arch.dbg_reg.iac1; + break; + case SPRN_IAC2: + *spr_val = vcpu->arch.dbg_reg.iac2; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case SPRN_IAC3: + *spr_val = vcpu->arch.dbg_reg.iac3; + break; + case SPRN_IAC4: + *spr_val = vcpu->arch.dbg_reg.iac4; + break; +#endif + case SPRN_DAC1: + *spr_val = vcpu->arch.dbg_reg.dac1; + break; + case SPRN_DAC2: + *spr_val = vcpu->arch.dbg_reg.dac2; + break; case SPRN_DBCR0: 
*spr_val = vcpu->arch.dbg_reg.dbcr0; + if (vcpu->guest_debug) + *spr_val = *spr_val | DBCR0_EDM; break; case SPRN_DBCR1: *spr_val = vcpu->arch.dbg_reg.dbcr1; break; + case SPRN_DBCR2: + *spr_val = vcpu->arch.dbg_reg.dbcr2; + break; case SPRN_DBSR: *spr_val = vcpu->arch.dbsr; break; diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index e9fa56a911fd..81bd8a07aa51 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -238,7 +238,7 @@ kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ - SPRN_SRR0, SPRN_SRR1,NEED_ESR + SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU) kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ @@ -256,11 +256,9 @@ kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ +kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \ SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ - SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ +kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \ SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ SPRN_SRR0, SPRN_SRR1, 0 @@ -350,7 +348,7 @@ kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) -kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, 
SPRN_SRR1, (NEED_ESR | NEED_EMU) kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 @@ -361,9 +359,6 @@ kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 -kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index a326178bdea5..72920bed3ac6 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -22,6 +22,7 @@ #include <linux/kvm_host.h> #include <asm/mmu-book3e.h> #include <asm/tlb.h> +#include <asm/cputhreads.h> enum vcpu_ftr { VCPU_FTR_MMU_V2 @@ -289,6 +290,25 @@ void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500); #define kvmppc_e500_get_tlb_stid(vcpu, gtlbe) get_tlb_tid(gtlbe) #define get_tlbmiss_tid(vcpu) get_cur_pid(vcpu) #define get_tlb_sts(gtlbe) (gtlbe->mas1 & MAS1_TS) + +/* + * These functions should be called with preemption disabled + * and the returned value is valid only in that context + */ +static inline int get_thread_specific_lpid(int vm_lpid) +{ + int vcpu_lpid = vm_lpid; + + if (threads_per_core == 2) + vcpu_lpid |= smp_processor_id() & 1; + + return vcpu_lpid; +} + +static inline int get_lpid(struct kvm_vcpu *vcpu) +{ + return get_thread_specific_lpid(vcpu->kvm->arch.lpid); +} #else unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, struct kvm_book3e_206_tlb_entry *gtlbe); diff --git a/arch/powerpc/kvm/e500_emulate.c 
b/arch/powerpc/kvm/e500_emulate.c index c99c40e9182a..ce7291c79f6c 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -259,6 +259,7 @@ int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_va break; /* extra exceptions */ +#ifdef CONFIG_SPE_POSSIBLE case SPRN_IVOR32: vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val; break; @@ -268,6 +269,15 @@ int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_va case SPRN_IVOR34: vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val; break; +#endif +#ifdef CONFIG_ALTIVEC + case SPRN_IVOR32: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL] = spr_val; + break; + case SPRN_IVOR33: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST] = spr_val; + break; +#endif case SPRN_IVOR35: vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; @@ -381,6 +391,7 @@ int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_v break; /* extra exceptions */ +#ifdef CONFIG_SPE_POSSIBLE case SPRN_IVOR32: *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; break; @@ -390,6 +401,15 @@ int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_v case SPRN_IVOR34: *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; break; +#endif +#ifdef CONFIG_ALTIVEC + case SPRN_IVOR32: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL]; + break; + case SPRN_IVOR33: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST]; + break; +#endif case SPRN_IVOR35: *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; break; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 08f14bb57897..769778f855b0 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -69,7 +69,8 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) * writing shadow tlb entry to host TLB */ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry 
*stlbe, - uint32_t mas0) + uint32_t mas0, + uint32_t lpid) { unsigned long flags; @@ -80,7 +81,7 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); #ifdef CONFIG_KVM_BOOKE_HV - mtspr(SPRN_MAS8, stlbe->mas8); + mtspr(SPRN_MAS8, MAS8_TGS | get_thread_specific_lpid(lpid)); #endif asm volatile("isync; tlbwe" : : : "memory"); @@ -129,11 +130,12 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, if (tlbsel == 0) { mas0 = get_host_mas0(stlbe->mas2); - __write_host_tlbe(stlbe, mas0); + __write_host_tlbe(stlbe, mas0, vcpu_e500->vcpu.kvm->arch.lpid); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(sesel))); + MAS0_ESEL(to_htlb1_esel(sesel)), + vcpu_e500->vcpu.kvm->arch.lpid); } } @@ -176,7 +178,7 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; magic.mas8 = 0; - __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); + __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index), 0); preempt_enable(); } #endif @@ -317,10 +319,6 @@ static void kvmppc_e500_setup_stlbe( stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); - -#ifdef CONFIG_KVM_BOOKE_HV - stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; -#endif } static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -633,7 +631,7 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, local_irq_save(flags); mtspr(SPRN_MAS6, (vcpu->arch.pid << MAS6_SPID_SHIFT) | addr_space); - mtspr(SPRN_MAS5, MAS5_SGS | vcpu->kvm->arch.lpid); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(vcpu)); asm volatile("tlbsx 0, %[geaddr]\n" : : [geaddr] "r" (geaddr)); mtspr(SPRN_MAS5, 0); @@ -732,7 +730,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return 
0; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { /* XXX could be more clever ;) */ return 0; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 164bad2a19bf..2fdc8722e324 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -48,10 +48,11 @@ void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type) return; } - - tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id; + preempt_disable(); + tag = PPC_DBELL_LPID(get_lpid(vcpu)) | vcpu->vcpu_id; mb(); ppc_msgsnd(dbell_type, 0, tag); + preempt_enable(); } /* gtlbe must not be mapped by more than one host tlb entry */ @@ -60,12 +61,11 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, { unsigned int tid, ts; gva_t eaddr; - u32 val, lpid; + u32 val; unsigned long flags; ts = get_tlb_ts(gtlbe); tid = get_tlb_tid(gtlbe); - lpid = vcpu_e500->vcpu.kvm->arch.lpid; /* We search the host TLB to invalidate its shadow TLB entry */ val = (tid << 16) | ts; @@ -74,7 +74,7 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, local_irq_save(flags); mtspr(SPRN_MAS6, val); - mtspr(SPRN_MAS5, MAS5_SGS | lpid); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu)); asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr)); val = mfspr(SPRN_MAS1); @@ -95,7 +95,7 @@ void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) unsigned long flags; local_irq_save(flags); - mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu)); asm volatile("tlbilxlpid"); mtspr(SPRN_MAS5, 0); local_irq_restore(flags); @@ -110,6 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) { } +/* We use two lpids per VM */ static DEFINE_PER_CPU(struct kvm_vcpu *[KVMPPC_NR_LPIDS], last_vcpu_of_lpid); static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) @@ -118,10 +119,12 @@ static 
void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) kvmppc_booke_vcpu_load(vcpu, cpu); - mtspr(SPRN_LPID, vcpu->kvm->arch.lpid); + mtspr(SPRN_LPID, get_lpid(vcpu)); mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); mtspr(SPRN_GPIR, vcpu->vcpu_id); mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp); + vcpu->arch.eplc = EPC_EGS | (get_lpid(vcpu) << EPC_ELPID_SHIFT); + vcpu->arch.epsc = vcpu->arch.eplc; mtspr(SPRN_EPLC, vcpu->arch.eplc); mtspr(SPRN_EPSC, vcpu->arch.epsc); @@ -141,12 +144,10 @@ static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_GESR, vcpu->arch.shared->esr); if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || - __get_cpu_var(last_vcpu_of_lpid)[vcpu->kvm->arch.lpid] != vcpu) { + __get_cpu_var(last_vcpu_of_lpid)[get_lpid(vcpu)] != vcpu) { kvmppc_e500_tlbil_all(vcpu_e500); - __get_cpu_var(last_vcpu_of_lpid)[vcpu->kvm->arch.lpid] = vcpu; + __get_cpu_var(last_vcpu_of_lpid)[get_lpid(vcpu)] = vcpu; } - - kvmppc_load_guest_fp(vcpu); } static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) @@ -179,6 +180,16 @@ int kvmppc_core_check_processor_compat(void) r = 0; else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) r = 0; +#ifdef CONFIG_ALTIVEC + /* + * Since guests have the priviledge to enable AltiVec, we need AltiVec + * support in the host to save/restore their context. + * Don't use CPU_FTR_ALTIVEC to identify cores with AltiVec unit + * because it's cleared in the absence of CONFIG_ALTIVEC! 
+ */ + else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0) + r = 0; +#endif else r = -ENOTSUPP; @@ -194,9 +205,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_64BIT vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; #endif - vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; - vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); - vcpu->arch.epsc = vcpu->arch.eplc; + vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_PMMP; vcpu->arch.pvr = mfspr(SPRN_PVR); vcpu_e500->svr = mfspr(SPRN_SVR); @@ -356,13 +365,26 @@ static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) if (lpid < 0) return lpid; + /* + * Use two lpids per VM on cores with two threads like e6500. Use + * even numbers to speedup vcpu lpid computation with consecutive lpids + * per VM. vm1 will use lpids 2 and 3, vm2 lpids 4 and 5, and so on. + */ + if (threads_per_core == 2) + lpid <<= 1; + kvm->arch.lpid = lpid; return 0; } static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) { - kvmppc_free_lpid(kvm->arch.lpid); + int lpid = kvm->arch.lpid; + + if (threads_per_core == 2) + lpid >>= 1; + + kvmppc_free_lpid(lpid); } static struct kvmppc_ops kvm_ops_e500mc = { @@ -390,7 +412,13 @@ static int __init kvmppc_e500mc_init(void) if (r) goto err_out; - kvmppc_init_lpid(64); + /* + * Use two lpids per VM on dual threaded processors like e6500 + * to workarround the lack of tlb write conditional instruction. + * Expose half the number of available hardware lpids to the lpid + * allocator. 
+ */ + kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); kvmppc_claim_lpid(0); /* host */ r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index e96b50d0bdab..5cc2e7af3a7b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -219,7 +219,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); - emulated = kvmppc_get_last_inst(vcpu, false, &inst); + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); if (emulated != EMULATE_DONE) return emulated; @@ -274,6 +274,21 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } break; + case 0: + /* + * Instruction with primary opcode 0. Based on PowerISA + * these are illegal instructions. + */ + if (inst == KVMPPC_INST_SW_BREAKPOINT) { + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = kvmppc_get_pc(vcpu); + emulated = EMULATE_EXIT_USER; + advance = 0; + } else + emulated = EMULATE_FAIL; + + break; + default: emulated = EMULATE_FAIL; } diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 0de4ffa175a9..6d3c0ee1d744 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -58,7 +58,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); - emulated = kvmppc_get_last_inst(vcpu, false, &inst); + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); if (emulated != EMULATE_DONE) return emulated; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4c79284b58be..c1f8f53cd312 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -294,7 +294,7 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct 
kvm_vcpu *vcpu) { u32 last_inst; - kvmppc_get_last_inst(vcpu, false, &last_inst); + kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); /* XXX Deliver Program interrupt to guest. */ pr_emerg("%s: emulation failed (%08x)\n", __func__, last_inst); r = RESUME_HOST; @@ -384,24 +384,16 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, } EXPORT_SYMBOL_GPL(kvmppc_ld); -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { return 0; } -void kvm_arch_hardware_disable(void *garbage) -{ -} - int kvm_arch_hardware_setup(void) { return 0; } -void kvm_arch_hardware_unsetup(void) -{ -} - void kvm_arch_check_processor_compat(void *rtn) { *(int *)rtn = kvmppc_core_check_processor_compat(); @@ -462,10 +454,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) module_put(kvm->arch.kvm_ops->owner); } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} - int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -608,10 +596,6 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return kvmppc_core_create_memslot(kvm, slot, npages); } -void kvm_arch_memslots_updated(struct kvm *kvm) -{ -} - int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, @@ -628,10 +612,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvmppc_core_commit_memory_region(kvm, mem, old); } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -658,7 +638,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ hrtimer_cancel(&vcpu->arch.dec_timer); - tasklet_kill(&vcpu->arch.tasklet); kvmppc_remove_vcpu_debugfs(vcpu); @@ -684,16 +663,12 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -/* - * low level hrtimer wake routine. 
Because this runs in hardirq context - * we schedule a tasklet to do the real work. - */ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) { struct kvm_vcpu *vcpu; vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); - tasklet_schedule(&vcpu->arch.tasklet); + kvmppc_decrementer_func(vcpu); return HRTIMER_NORESTART; } @@ -703,7 +678,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) int ret; hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; vcpu->arch.dec_expires = ~(u64)0; @@ -927,6 +901,103 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_handle_store); +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + r = kvmppc_get_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; /* get: read vcpu state into val, copied to userspace below */ + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + r = kvmppc_set_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; /* set: store the value copied from userspace above */ + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + + return r; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; @@ -1343,9 +1414,4 @@ int kvm_arch_init(void *opaque) return 0; } -void kvm_arch_exit(void) -{ - -} - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index e8bc40869cbd..7d9ee3d8c618 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -303,9 +303,13 @@ config PPC_ICSWX_USE_SIGILL If in doubt, say N here. 
+config SPE_POSSIBLE + def_bool y + depends on E200 || (E500 && !PPC_E500MC) + config SPE bool "SPE Support" - depends on E200 || (E500 && !PPC_E500MC) + depends on SPE_POSSIBLE default y ---help--- This option enables kernel support for the Signal Processing diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 773bef7614d8..2175f911a73a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -13,8 +13,11 @@ #ifndef ASM_KVM_HOST_H #define ASM_KVM_HOST_H + +#include <linux/types.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> +#include <linux/kvm_types.h> #include <linux/kvm_host.h> #include <linux/kvm.h> #include <asm/debug.h> @@ -154,7 +157,9 @@ struct kvm_s390_sie_block { __u8 armid; /* 0x00e3 */ __u8 reservede4[4]; /* 0x00e4 */ __u64 tecmc; /* 0x00e8 */ - __u8 reservedf0[16]; /* 0x00f0 */ + __u8 reservedf0[12]; /* 0x00f0 */ +#define CRYCB_FORMAT1 0x00000001 + __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ __u64 gbea; /* 0x0180 */ __u8 reserved188[24]; /* 0x0188 */ @@ -187,6 +192,7 @@ struct kvm_vcpu_stat { u32 exit_stop_request; u32 exit_validity; u32 exit_instruction; + u32 halt_wakeup; u32 instruction_lctl; u32 instruction_lctlg; u32 instruction_stctl; @@ -407,6 +413,15 @@ struct s390_io_adapter { #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) #define MAX_S390_ADAPTER_MAPS 256 +struct kvm_s390_crypto { + struct kvm_s390_crypto_cb *crycb; + __u32 crycbd; +}; + +struct kvm_s390_crypto_cb { + __u8 reserved00[128]; /* 0x0000 */ +}; + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; @@ -420,6 +435,7 @@ struct kvm_arch{ struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; spinlock_t start_stop_lock; + struct kvm_s390_crypto crypto; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -431,8 +447,6 @@ static inline bool kvm_is_error_hva(unsigned long addr) } #define ASYNC_PF_PER_VCPU 64 -struct kvm_vcpu; -struct kvm_async_pf; struct kvm_arch_async_pf { 
unsigned long pfault_token; }; @@ -450,4 +464,18 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; + +static inline void kvm_arch_hardware_disable(void) {} +static inline void kvm_arch_check_processor_compat(void *rtn) {} +static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_sync_events(struct kvm *kvm) {} +static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} +static inline void kvm_arch_memslots_updated(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} +static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) {} + #endif diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 9e18a61d3df3..d39a31c3cdf2 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -18,9 +18,9 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); -unsigned long *page_table_alloc(struct mm_struct *, unsigned long); +unsigned long *page_table_alloc(struct mm_struct *); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_rcu(struct mmu_gather *, unsigned long *); +void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long, bool init_skey); @@ -145,8 +145,8 @@ static inline void pmd_populate(struct mm_struct *mm, /* * page table entry allocation/free routines. 
*/ -#define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm, vmaddr)) -#define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm, vmaddr)) +#define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) +#define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5efb2fe186e7..b7054356cc98 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/page-flags.h> +#include <linux/radix-tree.h> #include <asm/bug.h> #include <asm/page.h> @@ -789,82 +790,67 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) /** * struct gmap_struct - guest address space + * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct + * @guest_to_host: radix tree with guest to host address translation + * @host_to_guest: radix tree with pointer to segment table entries + * @guest_table_lock: spinlock to protect all entries in the guest page table * @table: pointer to the page directory * @asce: address space control element for gmap page table - * @crst_list: list of all crst tables used in the guest address space * @pfault_enabled: defines if pfaults are applicable for the guest */ struct gmap { struct list_head list; + struct list_head crst_list; struct mm_struct *mm; + struct radix_tree_root guest_to_host; + struct radix_tree_root host_to_guest; + spinlock_t guest_table_lock; unsigned long *table; unsigned long asce; + unsigned long asce_end; void *private; - struct list_head crst_list; bool pfault_enabled; }; /** - * struct gmap_rmap - reverse mapping for segment table entries - * @gmap: pointer to the gmap_struct - * @entry: pointer 
to a segment table entry - * @vmaddr: virtual address in the guest address space - */ -struct gmap_rmap { - struct list_head list; - struct gmap *gmap; - unsigned long *entry; - unsigned long vmaddr; -}; - -/** - * struct gmap_pgtable - gmap information attached to a page table - * @vmaddr: address of the 1MB segment in the process virtual memory - * @mapper: list of segment table entries mapping a page table - */ -struct gmap_pgtable { - unsigned long vmaddr; - struct list_head mapper; -}; - -/** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long address); + void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); }; -struct gmap *gmap_alloc(struct mm_struct *mm); +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); void gmap_free(struct gmap *gmap); void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(unsigned long address, struct gmap *); -unsigned long gmap_translate(unsigned long address, struct gmap *); -unsigned long __gmap_fault(unsigned long address, struct gmap *); -unsigned long gmap_fault(unsigned long address, struct gmap *); -void gmap_discard(unsigned long from, unsigned long to, struct gmap *); -void __gmap_zap(unsigned long address, struct gmap *); +unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); +unsigned long gmap_translate(struct gmap *, unsigned long gaddr); +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); +int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); +void gmap_discard(struct gmap *, unsigned long from, unsigned long to); +void 
__gmap_zap(struct gmap *, unsigned long gaddr); bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *); void gmap_register_ipte_notifier(struct gmap_notifier *); void gmap_unregister_ipte_notifier(struct gmap_notifier *); int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); -void gmap_do_ipte_notify(struct mm_struct *, pte_t *); +void gmap_do_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *); static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE if (pgste_val(pgste) & PGSTE_IN_BIT) { pgste_val(pgste) &= ~PGSTE_IN_BIT; - gmap_do_ipte_notify(mm, ptep); + gmap_do_ipte_notify(mm, addr, ptep); } #endif return pgste; @@ -1110,7 +1096,7 @@ static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm, pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, ptep, pgste); + pgste = pgste_ipte_notify(mm, addr, ptep, pgste); __ptep_ipte(addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; @@ -1132,7 +1118,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, if (mm_has_pgste(vma->vm_mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste); + pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste); } oldpte = pte = *ptep; @@ -1179,7 +1165,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, ptep, pgste); + pgste = pgste_ipte_notify(mm, address, ptep, pgste); } pte = *ptep; @@ -1203,7 +1189,7 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste_ipte_notify(mm, ptep, pgste); + pgste_ipte_notify(mm, address, ptep, pgste); } pte = *ptep; @@ -1240,7 +1226,7 @@ static inline pte_t ptep_clear_flush(struct 
vm_area_struct *vma, if (mm_has_pgste(vma->vm_mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste); + pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste); } pte = *ptep; @@ -1274,7 +1260,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, if (!full && mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, ptep, pgste); + pgste = pgste_ipte_notify(mm, address, ptep, pgste); } pte = *ptep; @@ -1299,7 +1285,7 @@ static inline pte_t ptep_set_wrprotect(struct mm_struct *mm, if (pte_write(pte)) { if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, ptep, pgste); + pgste = pgste_ipte_notify(mm, address, ptep, pgste); } ptep_flush_lazy(mm, address, ptep); @@ -1325,7 +1311,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, return 0; if (mm_has_pgste(vma->vm_mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste); + pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste); } ptep_flush_direct(vma->vm_mm, address, ptep); diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index a25f09fbaf36..572c59949004 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -105,7 +105,7 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long address) { - page_table_free_rcu(tlb, (unsigned long *) pte); + page_table_free_rcu(tlb, (unsigned long *) pte, address); } /* diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 0fc26430a1e5..48eda3ab4944 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -111,12 +111,22 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GPRS (1UL << 1) #define KVM_SYNC_ACRS (1UL << 2) #define KVM_SYNC_CRS (1UL << 3) +#define KVM_SYNC_ARCH0 (1UL << 4) +#define 
KVM_SYNC_PFAULT (1UL << 5) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ __u64 gprs[16]; /* general purpose registers */ __u32 acrs[16]; /* access registers */ __u64 crs[16]; /* control registers */ + __u64 todpr; /* tod programmable register [ARCH0] */ + __u64 cputm; /* cpu timer [ARCH0] */ + __u64 ckc; /* clock comparator [ARCH0] */ + __u64 pp; /* program parameter [ARCH0] */ + __u64 gbea; /* guest breaking-event address [ARCH0] */ + __u64 pft; /* pfault token [PFAULT] */ + __u64 pfs; /* pfault select [PFAULT] */ + __u64 pfc; /* pfault compare [PFAULT] */ }; #define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 59bd8f991b98..9254afff250c 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -28,22 +28,32 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; - if (start & ~PAGE_MASK || end & ~PAGE_MASK || start > end + if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); vcpu->stat.diagnose_10++; - /* we checked for start > end above */ - if (end < prefix || start >= prefix + 2 * PAGE_SIZE) { - gmap_discard(start, end, vcpu->arch.gmap); + /* + * We checked for start >= end above, so lets check for the + * fast path (no prefix swap page involved) + */ + if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { + gmap_discard(vcpu->arch.gmap, start, end); } else { - if (start < prefix) - gmap_discard(start, prefix, vcpu->arch.gmap); - if (end >= prefix) - gmap_discard(prefix + 2 * PAGE_SIZE, - end, vcpu->arch.gmap); + /* + * This is slow path. 
gmap_discard will check for start + * so lets split this into before prefix, prefix, after + * prefix and let gmap_discard make some of these calls + * NOPs. + */ + gmap_discard(vcpu->arch.gmap, start, prefix); + if (start <= prefix) + gmap_discard(vcpu->arch.gmap, 0, 4096); + if (end > prefix + 4096) + gmap_discard(vcpu->arch.gmap, 4096, 8192); + gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); } return 0; } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4653ac6e182b..0f961a1c64b3 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -254,8 +254,7 @@ static void ipte_unlock_simple(struct kvm_vcpu *vcpu) new = old = ACCESS_ONCE(*ic); new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - if (!ipte_lock_count) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&vcpu->kvm->arch.ipte_wq); out: mutex_unlock(&ipte_mutex); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f4c819bfc193..a39838457f01 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -26,8 +26,9 @@ #define IOINT_SSID_MASK 0x00030000 #define IOINT_CSSID_MASK 0x03fc0000 #define IOINT_AI_MASK 0x04000000 +#define PFAULT_INIT 0x0600 -static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu); +static int __must_check deliver_ckc_interrupt(struct kvm_vcpu *vcpu); static int is_ioint(u64 type) { @@ -76,7 +77,7 @@ static u64 int_word_to_isc_bits(u32 int_word) return (0x80 >> isc) << 24; } -static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, +static int __must_check __interrupt_is_deliverable(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info *inti) { switch (inti->type) { @@ -85,6 +86,7 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, return 0; if (vcpu->arch.sie_block->gcr[0] & 0x2000ul) return 1; + return 0; case KVM_S390_INT_EMERGENCY: if (psw_extint_disabled(vcpu)) return 0; @@ -205,11 +207,30 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu, } } -static int 
__deliver_prog_irq(struct kvm_vcpu *vcpu, - struct kvm_s390_pgm_info *pgm_info) +static u16 get_ilc(struct kvm_vcpu *vcpu) { const unsigned short table[] = { 2, 4, 4, 6 }; + + switch (vcpu->arch.sie_block->icptcode) { + case ICPT_INST: + case ICPT_INSTPROGI: + case ICPT_OPEREXC: + case ICPT_PARTEXEC: + case ICPT_IOINST: + /* last instruction only stored for these icptcodes */ + return table[vcpu->arch.sie_block->ipa >> 14]; + case ICPT_PROGI: + return vcpu->arch.sie_block->pgmilc; + default: + return 0; + } +} + +static int __must_check __deliver_prog_irq(struct kvm_vcpu *vcpu, + struct kvm_s390_pgm_info *pgm_info) +{ int rc = 0; + u16 ilc = get_ilc(vcpu); switch (pgm_info->code & ~PGM_PER) { case PGM_AFX_TRANSLATION: @@ -276,25 +297,7 @@ static int __deliver_prog_irq(struct kvm_vcpu *vcpu, (u8 *) __LC_PER_ACCESS_ID); } - switch (vcpu->arch.sie_block->icptcode) { - case ICPT_INST: - case ICPT_INSTPROGI: - case ICPT_OPEREXC: - case ICPT_PARTEXEC: - case ICPT_IOINST: - /* last instruction only stored for these icptcodes */ - rc |= put_guest_lc(vcpu, table[vcpu->arch.sie_block->ipa >> 14], - (u16 *) __LC_PGM_ILC); - break; - case ICPT_PROGI: - rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->pgmilc, - (u16 *) __LC_PGM_ILC); - break; - default: - rc |= put_guest_lc(vcpu, 0, - (u16 *) __LC_PGM_ILC); - } - + rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, pgm_info->code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, @@ -305,7 +308,7 @@ static int __deliver_prog_irq(struct kvm_vcpu *vcpu, return rc; } -static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, +static int __must_check __do_deliver_interrupt(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info *inti) { const unsigned short table[] = { 2, 4, 4, 6 }; @@ -343,7 +346,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, case KVM_S390_INT_CLOCK_COMP: trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, 0); - 
deliver_ckc_interrupt(vcpu); + rc = deliver_ckc_interrupt(vcpu); break; case KVM_S390_INT_CPU_TIMER: trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -376,8 +379,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, case KVM_S390_INT_PFAULT_INIT: trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, inti->ext.ext_params2); - rc = put_guest_lc(vcpu, 0x2603, (u16 *) __LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0x0600, (u16 *) __LC_EXT_CPU_ADDR); + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, + (u16 *) __LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, PFAULT_INIT, (u16 *) __LC_EXT_CPU_ADDR); rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, @@ -501,14 +505,11 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, default: BUG(); } - if (rc) { - printk("kvm: The guest lowcore is not mapped during interrupt " - "delivery, killing userspace\n"); - do_exit(SIGKILL); - } + + return rc; } -static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu) +static int __must_check deliver_ckc_interrupt(struct kvm_vcpu *vcpu) { int rc; @@ -518,11 +519,7 @@ static void deliver_ckc_interrupt(struct kvm_vcpu *vcpu) rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc) { - printk("kvm: The guest lowcore is not mapped during interrupt " - "delivery, killing userspace\n"); - do_exit(SIGKILL); - } + return rc; } /* Check whether SIGP interpretation facility has an external call pending */ @@ -629,6 +626,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) */ vcpu->preempted = true; wake_up_interruptible(&vcpu->wq); + vcpu->stat.halt_wakeup++; } } @@ -661,12 +659,13 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl); } -void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) +int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct 
kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; struct kvm_s390_interrupt_info *n, *inti = NULL; int deliver; + int rc = 0; __reset_intercept_indicators(vcpu); if (atomic_read(&li->active)) { @@ -685,16 +684,16 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) atomic_set(&li->active, 0); spin_unlock(&li->lock); if (deliver) { - __do_deliver_interrupt(vcpu, inti); + rc = __do_deliver_interrupt(vcpu, inti); kfree(inti); } - } while (deliver); + } while (!rc && deliver); } - if (kvm_cpu_has_pending_timer(vcpu)) - deliver_ckc_interrupt(vcpu); + if (!rc && kvm_cpu_has_pending_timer(vcpu)) + rc = deliver_ckc_interrupt(vcpu); - if (atomic_read(&fi->active)) { + if (!rc && atomic_read(&fi->active)) { do { deliver = 0; spin_lock(&fi->lock); @@ -711,67 +710,13 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) atomic_set(&fi->active, 0); spin_unlock(&fi->lock); if (deliver) { - __do_deliver_interrupt(vcpu, inti); - kfree(inti); - } - } while (deliver); - } -} - -void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu) -{ - struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; - struct kvm_s390_interrupt_info *n, *inti = NULL; - int deliver; - - __reset_intercept_indicators(vcpu); - if (atomic_read(&li->active)) { - do { - deliver = 0; - spin_lock(&li->lock); - list_for_each_entry_safe(inti, n, &li->list, list) { - if ((inti->type == KVM_S390_MCHK) && - __interrupt_is_deliverable(vcpu, inti)) { - list_del(&inti->list); - deliver = 1; - break; - } - __set_intercept_indicator(vcpu, inti); - } - if (list_empty(&li->list)) - atomic_set(&li->active, 0); - spin_unlock(&li->lock); - if (deliver) { - __do_deliver_interrupt(vcpu, inti); + rc = __do_deliver_interrupt(vcpu, inti); kfree(inti); } - } while (deliver); + } while (!rc && deliver); } - if (atomic_read(&fi->active)) { - do { - 
deliver = 0; - spin_lock(&fi->lock); - list_for_each_entry_safe(inti, n, &fi->list, list) { - if ((inti->type == KVM_S390_MCHK) && - __interrupt_is_deliverable(vcpu, inti)) { - list_del(&inti->list); - fi->irq_count--; - deliver = 1; - break; - } - __set_intercept_indicator(vcpu, inti); - } - if (list_empty(&fi->list)) - atomic_set(&fi->active, 0); - spin_unlock(&fi->lock); - if (deliver) { - __do_deliver_interrupt(vcpu, inti); - kfree(inti); - } - } while (deliver); - } + return rc; } int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) @@ -1048,7 +993,6 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm, s390int->parm64, 2); - mutex_lock(&vcpu->kvm->lock); li = &vcpu->arch.local_int; spin_lock(&li->lock); if (inti->type == KVM_S390_PROGRAM_INT) @@ -1060,7 +1004,6 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, li->action_bits |= ACTION_STOP_ON_STOP; atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); spin_unlock(&li->lock); - mutex_unlock(&vcpu->kvm->lock); kvm_s390_vcpu_wakeup(vcpu); return 0; } @@ -1300,7 +1243,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) } INIT_LIST_HEAD(&map->list); map->guest_addr = addr; - map->addr = gmap_translate(addr, kvm->arch.gmap); + map->addr = gmap_translate(kvm->arch.gmap, addr); if (map->addr == -EFAULT) { ret = -EFAULT; goto out; @@ -1410,7 +1353,6 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = enqueue_floating_irq(dev, attr); break; case KVM_DEV_FLIC_CLEAR_IRQS: - r = 0; kvm_s390_clear_float_irqs(dev->kvm); break; case KVM_DEV_FLIC_APF_ENABLE: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 81b0e11521e4..55aade49b6d1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -50,6 +50,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instruction", VCPU_STAT(exit_instruction) }, { "exit_program_interruption", 
VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, { "instruction_stctl", VCPU_STAT(instruction_stctl) }, @@ -100,16 +101,12 @@ int test_vfacility(unsigned long nr) } /* Section: not file related */ -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { /* every s390 is virtualization enabled ;-) */ return 0; } -void kvm_arch_hardware_disable(void *garbage) -{ -} - static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); int kvm_arch_hardware_setup(void) @@ -124,17 +121,10 @@ void kvm_arch_hardware_unsetup(void) gmap_unregister_ipte_notifier(&gmap_notifier); } -void kvm_arch_check_processor_compat(void *rtn) -{ -} - int kvm_arch_init(void *opaque) { - return 0; -} - -void kvm_arch_exit(void) -{ + /* Register floating interrupt controller interface. 
*/ + return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } /* Section: device related */ @@ -404,6 +394,22 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } +static int kvm_s390_crypto_init(struct kvm *kvm) +{ + if (!test_vfacility(76)) + return 0; + + kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb), + GFP_KERNEL | GFP_DMA); + if (!kvm->arch.crypto.crycb) + return -ENOMEM; + + kvm->arch.crypto.crycbd = (__u32) (unsigned long) kvm->arch.crypto.crycb | + CRYCB_FORMAT1; + + return 0; +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int rc; @@ -441,6 +447,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.dbf) goto out_nodbf; + if (kvm_s390_crypto_init(kvm) < 0) + goto out_crypto; + spin_lock_init(&kvm->arch.float_int.lock); INIT_LIST_HEAD(&kvm->arch.float_int.list); init_waitqueue_head(&kvm->arch.ipte_wq); @@ -451,7 +460,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type & KVM_VM_S390_UCONTROL) { kvm->arch.gmap = NULL; } else { - kvm->arch.gmap = gmap_alloc(current->mm); + kvm->arch.gmap = gmap_alloc(current->mm, (1UL << 44) - 1); if (!kvm->arch.gmap) goto out_nogmap; kvm->arch.gmap->private = kvm; @@ -465,6 +474,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; out_nogmap: + kfree(kvm->arch.crypto.crycb); +out_crypto: debug_unregister(kvm->arch.dbf); out_nodbf: free_page((unsigned long)(kvm->arch.sca)); @@ -514,15 +525,12 @@ static void kvm_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -void kvm_arch_sync_events(struct kvm *kvm) -{ -} - void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); + kfree(kvm->arch.crypto.crycb); if (!kvm_is_ucontrol(kvm)) gmap_free(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); @@ -535,7 +543,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; 
kvm_clear_async_pf_completion_queue(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) { - vcpu->arch.gmap = gmap_alloc(current->mm); + vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; @@ -546,15 +554,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | - KVM_SYNC_CRS; + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT; return 0; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - /* Nothing todo */ -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { save_fp_ctl(&vcpu->arch.host_fpregs.fpc); @@ -607,6 +612,14 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) return 0; } +static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) +{ + if (!test_vfacility(76)) + return; + + vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; +} + void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { free_page(vcpu->arch.sie_block->cbrlo); @@ -653,6 +666,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; get_cpu_id(&vcpu->arch.cpu_id); vcpu->arch.cpu_id.version = 0xff; + + kvm_s390_vcpu_crypto_setup(vcpu); + return rc; } @@ -1049,6 +1065,11 @@ retry: goto retry; } + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + vcpu->arch.sie_block->ihcpu = 0xffff; + goto retry; + } + if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) { if (!ibs_enabled(vcpu)) { trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1); @@ -1085,18 +1106,8 @@ retry: */ long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable) { - struct mm_struct *mm = current->mm; - hva_t hva; - long rc; - - hva = gmap_fault(gpa, vcpu->arch.gmap); - if (IS_ERR_VALUE(hva)) - return (long)hva; - down_read(&mm->mmap_sem); - rc = get_user_pages(current, mm, hva, 1, writable, 0, NULL, NULL); - up_read(&mm->mmap_sem); - - return rc < 0 ? 
rc : 0; + return gmap_fault(vcpu->arch.gmap, gpa, + writable ? FAULT_FLAG_WRITE : 0); } static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, @@ -1191,8 +1202,11 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (test_cpu_flag(CIF_MCCK_PENDING)) s390_handle_mcck(); - if (!kvm_is_ucontrol(vcpu->kvm)) - kvm_s390_deliver_pending_interrupts(vcpu); + if (!kvm_is_ucontrol(vcpu->kvm)) { + rc = kvm_s390_deliver_pending_interrupts(vcpu); + if (rc) + return rc; + } rc = kvm_s390_handle_requests(vcpu); if (rc) @@ -1296,6 +1310,48 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; } +static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; + vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + vcpu->arch.sie_block->cputm = kvm_run->s.regs.cputm; + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; + vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; + vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PFAULT) { + vcpu->arch.pfault_token = kvm_run->s.regs.pft; + vcpu->arch.pfault_select = kvm_run->s.regs.pfs; + vcpu->arch.pfault_compare = kvm_run->s.regs.pfc; + } + kvm_run->kvm_dirty_regs = 0; +} + +static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; + kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); + memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); + 
kvm_run->s.regs.cputm = vcpu->arch.sie_block->cputm; + kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.pft = vcpu->arch.pfault_token; + kvm_run->s.regs.pfs = vcpu->arch.pfault_select; + kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int rc; @@ -1317,17 +1373,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return -EINVAL; } - vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; - vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) { - kvm_run->kvm_dirty_regs &= ~KVM_SYNC_PREFIX; - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - } - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - kvm_run->kvm_dirty_regs &= ~KVM_SYNC_CRS; - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - } + sync_regs(vcpu, kvm_run); might_fault(); rc = __vcpu_run(vcpu); @@ -1357,10 +1403,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rc = 0; } - kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; - kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; - kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); - memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); + store_regs(vcpu, kvm_run); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1489,7 +1532,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. 
*/ - vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); return; } @@ -1644,9 +1687,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - r = gmap_fault(arg, vcpu->arch.gmap); - if (!IS_ERR_VALUE(r)) - r = 0; + r = gmap_fault(vcpu->arch.gmap, arg, 0); break; } case KVM_ENABLE_CAP: @@ -1677,21 +1718,12 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ -} - int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm) -{ -} - /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -1737,15 +1769,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} - static int __init kvm_s390_init(void) { int ret; @@ -1764,7 +1787,7 @@ static int __init kvm_s390_init(void) return -ENOMEM; } memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16); - vfacilities[0] &= 0xff82fff3f4fc2000UL; + vfacilities[0] &= 0xff82fffbf47c2000UL; vfacilities[1] &= 0x005c000000000000UL; return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3862fa2cefe0..244d02303182 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -70,7 +70,7 @@ static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) { vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; - vcpu->arch.sie_block->ihcpu = 0xffff; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } @@ -138,8 +138,7 @@ 
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); -void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); -void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); +int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu); void kvm_s390_clear_float_irqs(struct kvm *kvm); int __must_check kvm_s390_inject_vm(struct kvm *kvm, @@ -228,6 +227,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int psw_extint_disabled(struct kvm_vcpu *vcpu); void kvm_s390_destroy_adapters(struct kvm *kvm); int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu); +extern struct kvm_device_ops kvm_flic_ops; /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index f89c1cd67751..72bb2dd8b9cd 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -352,13 +352,6 @@ static int handle_stfl(struct kvm_vcpu *vcpu) return 0; } -static void handle_new_psw(struct kvm_vcpu *vcpu) -{ - /* Check whether the new psw is enabled for machine checks. 
*/ - if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK) - kvm_s390_deliver_pending_machine_checks(vcpu); -} - #define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA) #define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL #define PSW_ADDR_24 0x0000000000ffffffUL @@ -405,7 +398,6 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; if (!is_valid_psw(gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } @@ -427,7 +419,6 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw = new_psw; if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } @@ -738,7 +729,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* invalid entry */ break; /* try to free backing */ - __gmap_zap(cbrle, gmap); + __gmap_zap(gmap, cbrle); } up_read(&gmap->mm->mmap_sem); if (i < entries) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 3f3b35403d0a..a2b81d6ce8a5 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -442,18 +442,15 @@ static inline int do_exception(struct pt_regs *regs, int access) down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE - gmap = (struct gmap *) - ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0); + gmap = (current->flags & PF_VCPU) ? 
+ (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { - address = __gmap_fault(address, gmap); + current->thread.gmap_addr = address; + address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; } - if (address == -ENOMEM) { - fault = VM_FAULT_OOM; - goto out_up; - } if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } @@ -530,6 +527,20 @@ retry: goto retry; } } +#ifdef CONFIG_PGSTE + if (gmap) { + address = __gmap_link(gmap, current->thread.gmap_addr, + address); + if (address == -EFAULT) { + fault = VM_FAULT_BADMAP; + goto out_up; + } + if (address == -ENOMEM) { + fault = VM_FAULT_OOM; + goto out_up; + } + } +#endif fault = 0; out_up: up_read(&mm->mmap_sem); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5404a6261db9..296b61a4af59 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -145,30 +145,56 @@ void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) /** * gmap_alloc - allocate a guest address space * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space * * Returns a guest address space structure. 
*/ -struct gmap *gmap_alloc(struct mm_struct *mm) +struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) { struct gmap *gmap; struct page *page; unsigned long *table; - + unsigned long etype, atype; + + if (limit < (1UL << 31)) { + limit = (1UL << 31) - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < (1UL << 42)) { + limit = (1UL << 42) - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < (1UL << 53)) { + limit = (1UL << 53) - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + spin_lock_init(&gmap->guest_table_lock); gmap->mm = mm; page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); if (!page) goto out_free; + page->index = 0; list_add(&page->lru, &gmap->crst_list); table = (unsigned long *) page_to_phys(page); - crst_table_init(table, _REGION1_ENTRY_EMPTY); + crst_table_init(table, etype); gmap->table = table; - gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + down_write(&mm->mmap_sem); list_add(&gmap->list, &mm->context.gmap_list); + up_write(&mm->mmap_sem); return gmap; out_free: @@ -178,36 +204,38 @@ out: } EXPORT_SYMBOL_GPL(gmap_alloc); -static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table) -{ - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; - - if (*table & _SEGMENT_ENTRY_INVALID) - return 0; - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry(rmap, &mp->mapper, list) { - if (rmap->entry != table) - continue; - 
list_del(&rmap->list); - kfree(rmap); - break; - } - *table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT; - return 1; -} - static void gmap_flush_tlb(struct gmap *gmap) { if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | - _ASCE_TYPE_REGION1); + __tlb_flush_asce(gmap->mm, gmap->asce); else __tlb_flush_global(); } +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure @@ -215,31 +243,21 @@ static void gmap_flush_tlb(struct gmap *gmap) void gmap_free(struct gmap *gmap) { struct page *page, *next; - unsigned long *table; - int i; - /* Flush tlb. */ if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, (unsigned long) gmap->table | - _ASCE_TYPE_REGION1); + __tlb_flush_asce(gmap->mm, gmap->asce); else __tlb_flush_global(); /* Free all segment & region tables. */ - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { - table = (unsigned long *) page_to_phys(page); - if ((*table & _REGION_ENTRY_TYPE_MASK) == 0) - /* Remove gmap rmap structures for segment table. 
*/ - for (i = 0; i < PTRS_PER_PMD; i++, table++) - gmap_unlink_segment(gmap, table); + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, ALLOC_ORDER); - } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + down_write(&gmap->mm->mmap_sem); list_del(&gmap->list); + up_write(&gmap->mm->mmap_sem); kfree(gmap); } EXPORT_SYMBOL_GPL(gmap_free); @@ -267,42 +285,97 @@ EXPORT_SYMBOL_GPL(gmap_disable); /* * gmap_alloc_table is assumed to be called with mmap_sem held */ -static int gmap_alloc_table(struct gmap *gmap, - unsigned long *table, unsigned long init) - __releases(&gmap->mm->page_table_lock) - __acquires(&gmap->mm->page_table_lock) +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) { struct page *page; unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - spin_unlock(&gmap->mm->page_table_lock); page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); - spin_lock(&gmap->mm->page_table_lock); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); + spin_lock(&gmap->mm->page_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - } else + page->index = gaddr; + page = NULL; + } + spin_unlock(&gmap->mm->page_table_lock); + if (page) __free_pages(page, ALLOC_ORDER); return 0; } /** + * __gmap_segment_gaddr - find virtual address from segment pointer + * @entry: pointer to a segment table entry in the guest address space + * + * Returns the virtual address in the guest address space for the segment + */ +static unsigned long __gmap_segment_gaddr(unsigned long *entry) +{ + struct page *page; + unsigned long offset; + + offset = (unsigned long) entry / sizeof(unsigned long); + 
offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; + page = pmd_to_page((pmd_t *) entry); + return page->index + offset; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * @vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long *entry; + int flush = 0; + + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); + if (entry) { + flush = (*entry != _SEGMENT_ENTRY_INVALID); + *entry = _SEGMENT_ENTRY_INVALID; + } + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** * gmap_unmap_segment - unmap segment from the guest address space * @gmap: pointer to the guest address space structure - * @addr: address in the guest address space + * @to: address in the guest address space * @len: length of the memory area to unmap * * Returns 0 if the unmap succeeded, -EINVAL if not. 
*/ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) { - unsigned long *table; unsigned long off; int flush; @@ -312,31 +385,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) return -EINVAL; flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); - for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the guest addr space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if (*table & _REGION_ENTRY_INVALID) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if (*table & _REGION_ENTRY_INVALID) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if (*table & _REGION_ENTRY_INVALID) - goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Clear segment table entry in guest address space. */ - flush |= gmap_unlink_segment(gmap, table); - *table = _SEGMENT_ENTRY_INVALID; - } -out: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + up_write(&gmap->mm->mmap_sem); if (flush) gmap_flush_tlb(gmap); return 0; @@ -348,87 +400,47 @@ EXPORT_SYMBOL_GPL(gmap_unmap_segment); * @gmap: pointer to the guest address space structure * @from: source address in the parent address space * @to: target address in the guest address space + * @len: length of the memory area to map * * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. 
*/ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len) { - unsigned long *table; unsigned long off; int flush; if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; - if (len == 0 || from + len > TASK_MAX_SIZE || - from + len < from || to + len < to) + if (len == 0 || from + len < from || to + len < to || + from + len > TASK_MAX_SIZE || to + len > gmap->asce_end) return -EINVAL; flush = 0; - down_read(&gmap->mm->mmap_sem); - spin_lock(&gmap->mm->page_table_lock); + down_write(&gmap->mm->mmap_sem); for (off = 0; off < len; off += PMD_SIZE) { - /* Walk the gmap address space page table */ - table = gmap->table + (((to + off) >> 53) & 0x7ff); - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 42) & 0x7ff); - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 31) & 0x7ff); - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY)) - goto out_unmap; - table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN); - table = table + (((to + off) >> 20) & 0x7ff); - - /* Store 'from' address in an invalid segment table entry. 
*/ - flush |= gmap_unlink_segment(gmap, table); - *table = (from + off) | (_SEGMENT_ENTRY_INVALID | - _SEGMENT_ENTRY_PROTECT); + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; } - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); + up_write(&gmap->mm->mmap_sem); if (flush) gmap_flush_tlb(gmap); - return 0; - -out_unmap: - spin_unlock(&gmap->mm->page_table_lock); - up_read(&gmap->mm->mmap_sem); + if (off >= len) + return 0; gmap_unmap_segment(gmap, to, len); return -ENOMEM; } EXPORT_SYMBOL_GPL(gmap_map_segment); -static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) -{ - unsigned long *table; - - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) - return ERR_PTR(-EFAULT); - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - return table; -} - /** * __gmap_translate - translate a guest address to a user space address - * @address: guest address * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address * * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. @@ -436,168 +448,161 @@ static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. 
*/ -unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { - unsigned long *segment_ptr, vmaddr, segment; - struct gmap_pgtable *mp; - struct page *page; + unsigned long vmaddr; - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) - return PTR_ERR(segment_ptr); - /* Convert the gmap address to an mm address. */ - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INVALID)) { - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } else if (segment & _SEGMENT_ENTRY_PROTECT) { - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - return vmaddr | (address & ~PMD_MASK); - } - return -EFAULT; + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); /** * gmap_translate - translate a guest address to a user space address - * @address: guest address * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address * * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. * This function does not establish potentially missing page table entries. 
*/ -unsigned long gmap_translate(unsigned long address, struct gmap *gmap) +unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) { unsigned long rc; down_read(&gmap->mm->mmap_sem); - rc = __gmap_translate(address, gmap); + rc = __gmap_translate(gmap, gaddr); up_read(&gmap->mm->mmap_sem); return rc; } EXPORT_SYMBOL_GPL(gmap_translate); -static int gmap_connect_pgtable(unsigned long address, unsigned long segment, - unsigned long *segment_ptr, struct gmap *gmap) +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @gmap: pointer to guest mapping meta data structure + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +static void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } +} + +/** + * gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. 
+ */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) { - unsigned long vmaddr; - struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; struct mm_struct *mm; - struct page *page; + unsigned long *table; + spinlock_t *ptl; pgd_t *pgd; pud_t *pud; pmd_t *pmd; + int rc; - mm = gmap->mm; - vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; - vma = find_vma(mm, vmaddr); - if (!vma || vma->vm_start > vmaddr) - return -EFAULT; + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr >> 53) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & 0xffe0000000000000)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr >> 42) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & 0xfffffc0000000000)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr >> 31) & 0x7ff; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & 0xffffffff80000000)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr >> 20) & 0x7ff; /* Walk the parent mm page table */ + mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); - pud = pud_alloc(mm, pgd, vmaddr); - if (!pud) - return -ENOMEM; - pmd = pmd_alloc(mm, pud, vmaddr); - if (!pmd) - return -ENOMEM; - if (!pmd_present(*pmd) && - __pte_alloc(mm, vma, pmd, vmaddr)) - return -ENOMEM; + VM_BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, vmaddr); + VM_BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); /* large pmds cannot yet be handled */ if 
(pmd_large(*pmd)) return -EFAULT; - /* pmd now points to a valid segment table entry. */ - rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); - if (!rmap) - return -ENOMEM; /* Link gmap segment table entry location to page table. */ - page = pmd_page(*pmd); - mp = (struct gmap_pgtable *) page->index; - rmap->gmap = gmap; - rmap->entry = segment_ptr; - rmap->vmaddr = address & PMD_MASK; - spin_lock(&mm->page_table_lock); - if (*segment_ptr == segment) { - list_add(&rmap->list, &mp->mapper); - /* Set gmap segment table entry to page table. */ - *segment_ptr = pmd_val(*pmd) & PAGE_MASK; - rmap = NULL; - } - spin_unlock(&mm->page_table_lock); - kfree(rmap); - return 0; -} - -static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table) -{ - struct gmap_rmap *rmap, *next; - struct gmap_pgtable *mp; - struct page *page; - int flush; - - flush = 0; - spin_lock(&mm->page_table_lock); - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - list_for_each_entry_safe(rmap, next, &mp->mapper, list) { - *rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID | - _SEGMENT_ENTRY_PROTECT); - list_del(&rmap->list); - kfree(rmap); - flush = 1; - } - spin_unlock(&mm->page_table_lock); - if (flush) - __tlb_flush_global(); + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_INVALID) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, table); + if (!rc) + *table = pmd_val(*pmd); + } else + rc = 0; + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; } -/* - * this function is assumed to be called with mmap_sem held +/** + * gmap_fault - resolve a fault on a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @fault_flags: flags to pass down to handle_mm_fault() + * + * Returns 0 on success, -ENOMEM for out of 
memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. */ -unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) { - unsigned long *segment_ptr, segment; - struct gmap_pgtable *mp; - struct page *page; + unsigned long vmaddr; int rc; - current->thread.gmap_addr = address; - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) - return -EFAULT; - /* Convert the gmap address to an mm address. */ - while (1) { - segment = *segment_ptr; - if (!(segment & _SEGMENT_ENTRY_INVALID)) { - /* Page table is present */ - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - return mp->vmaddr | (address & ~PMD_MASK); - } - if (!(segment & _SEGMENT_ENTRY_PROTECT)) - /* Nothing mapped in the gmap address space. */ - break; - rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap); - if (rc) - return rc; - } - return -EFAULT; -} - -unsigned long gmap_fault(unsigned long address, struct gmap *gmap) -{ - unsigned long rc; - down_read(&gmap->mm->mmap_sem); - rc = __gmap_fault(address, gmap); + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + goto out_up; + } + if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) { + rc = -EFAULT; + goto out_up; + } + rc = __gmap_link(gmap, gaddr, vmaddr); +out_up: up_read(&gmap->mm->mmap_sem); - return rc; } EXPORT_SYMBOL_GPL(gmap_fault); @@ -617,17 +622,24 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm) free_swap_and_cache(entry); } -/** - * The mm->mmap_sem lock must be held +/* + * this function is assumed to be called with mmap_sem held */ -static void gmap_zap_unused(struct mm_struct *mm, unsigned long address) +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { - unsigned long ptev, pgstev; + unsigned long vmaddr, ptev, pgstev; + pte_t *ptep, pte; spinlock_t 
*ptl; pgste_t pgste; - pte_t *ptep, pte; - ptep = get_locked_pte(mm, address, &ptl); + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) + return; + vmaddr |= gaddr & ~PMD_MASK; + /* Get pointer to the page table entry */ + ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); if (unlikely(!ptep)) return; pte = *ptep; @@ -639,87 +651,34 @@ static void gmap_zap_unused(struct mm_struct *mm, unsigned long address) ptev = pte_val(pte); if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) { - gmap_zap_swap_entry(pte_to_swp_entry(pte), mm); - pte_clear(mm, address, ptep); + gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm); + pte_clear(gmap->mm, vmaddr, ptep); } pgste_set_unlock(ptep, pgste); out_pte: pte_unmap_unlock(*ptep, ptl); } - -/* - * this function is assumed to be called with mmap_sem held - */ -void __gmap_zap(unsigned long address, struct gmap *gmap) -{ - unsigned long *table, *segment_ptr; - unsigned long segment, pgstev, ptev; - struct gmap_pgtable *mp; - struct page *page; - - segment_ptr = gmap_table_walk(address, gmap); - if (IS_ERR(segment_ptr)) - return; - segment = *segment_ptr; - if (segment & _SEGMENT_ENTRY_INVALID) - return; - page = pfn_to_page(segment >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - address = mp->vmaddr | (address & ~PMD_MASK); - /* Page table is present */ - table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN); - table = table + ((address >> 12) & 0xff); - pgstev = table[PTRS_PER_PTE]; - ptev = table[0]; - /* quick check, checked again with locks held */ - if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) || - ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) - gmap_zap_unused(gmap->mm, address); -} EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap) +void 
gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) { - - unsigned long *table, address, size; + unsigned long gaddr, vmaddr, size; struct vm_area_struct *vma; - struct gmap_pgtable *mp; - struct page *page; down_read(&gmap->mm->mmap_sem); - address = from; - while (address < to) { - /* Walk the gmap address space page table */ - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) { - address = (address + PMD_SIZE) & PMD_MASK; + for (gaddr = from; gaddr < to; + gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INVALID)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - page = pfn_to_page(*table >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - vma = find_vma(gmap->mm, mp->vmaddr); - size = min(to - address, PMD_SIZE - (address & ~PMD_MASK)); - zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK), - size, NULL); - address = (address + PMD_SIZE) & PMD_MASK; + vmaddr |= gaddr & ~PMD_MASK; + /* Find vma in the parent mm */ + vma = find_vma(gmap->mm, vmaddr); + size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); + zap_page_range(vma, vmaddr, size, NULL); } up_read(&gmap->mm->mmap_sem); } @@ -755,7 +714,7 @@ 
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); /** * gmap_ipte_notify - mark a range of ptes for invalidation notification * @gmap: pointer to guest mapping meta data structure - * @start: virtual address in the guest address space + * @gaddr: virtual address in the guest address space * @len: size of area * * Returns 0 if for each page in the given range a gmap mapping exists and @@ -763,7 +722,7 @@ EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); * for one or more pages -EFAULT is returned. If no memory could be allocated * -ENOMEM is returned. This function establishes missing page table entries. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) +int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) { unsigned long addr; spinlock_t *ptl; @@ -771,12 +730,12 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) pgste_t pgste; int rc = 0; - if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK)) + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) return -EINVAL; down_read(&gmap->mm->mmap_sem); while (len) { /* Convert gmap address and connect the page tables */ - addr = __gmap_fault(start, gmap); + addr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(addr)) { rc = addr; break; @@ -786,6 +745,9 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) rc = -EFAULT; break; } + rc = __gmap_link(gmap, gaddr, addr); + if (rc) + break; /* Walk the process page table, lock and get pte pointer */ ptep = get_locked_pte(gmap->mm, addr, &ptl); if (unlikely(!ptep)) @@ -796,7 +758,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len) pgste = pgste_get_lock(ptep); pgste_val(pgste) |= PGSTE_IN_BIT; pgste_set_unlock(ptep, pgste); - start += PAGE_SIZE; + gaddr += PAGE_SIZE; len -= PAGE_SIZE; } spin_unlock(ptl); @@ -809,28 +771,30 @@ EXPORT_SYMBOL_GPL(gmap_ipte_notify); /** * gmap_do_ipte_notify - call all invalidation callbacks for a specific 
pte. * @mm: pointer to the process mm_struct + * @addr: virtual address in the process address space * @pte: pointer to the page table entry * * This function is assumed to be called with the page table lock held * for the pte to notify. */ -void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte) +void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) { - unsigned long segment_offset; + unsigned long offset, gaddr; + unsigned long *table; struct gmap_notifier *nb; - struct gmap_pgtable *mp; - struct gmap_rmap *rmap; - struct page *page; + struct gmap *gmap; - segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - segment_offset = segment_offset * (4096 / sizeof(pte_t)); - page = pfn_to_page(__pa(pte) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (4096 / sizeof(pte_t)); spin_lock(&gmap_notifier_lock); - list_for_each_entry(rmap, &mp->mapper, list) { + list_for_each_entry(gmap, &mm->context.gmap_list, list) { + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (!table) + continue; + gaddr = __gmap_segment_gaddr(table) + offset; list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(rmap->gmap, - rmap->vmaddr + segment_offset); + nb->notifier_call(gmap, gaddr); } spin_unlock(&gmap_notifier_lock); } @@ -841,29 +805,18 @@ static inline int page_table_with_pgste(struct page *page) return atomic_read(&page->_mapcount) == 0; } -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) +static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm) { struct page *page; unsigned long *table; - struct gmap_pgtable *mp; page = alloc_page(GFP_KERNEL|__GFP_REPEAT); if (!page) return NULL; - mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT); - if (!mp) { - __free_page(page); - return NULL; - } if (!pgtable_page_ctor(page)) { - kfree(mp); __free_page(page); 
return NULL; } - mp->vmaddr = vmaddr & PMD_MASK; - INIT_LIST_HEAD(&mp->mapper); - page->index = (unsigned long) mp; atomic_set(&page->_mapcount, 0); table = (unsigned long *) page_to_phys(page); clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); @@ -874,14 +827,10 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, static inline void page_table_free_pgste(unsigned long *table) { struct page *page; - struct gmap_pgtable *mp; page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - mp = (struct gmap_pgtable *) page->index; - BUG_ON(!list_empty(&mp->mapper)); pgtable_page_dtor(page); atomic_set(&page->_mapcount, -1); - kfree(mp); __free_page(page); } @@ -994,13 +943,13 @@ retry: } if (!(pte_val(*ptep) & _PAGE_INVALID) && (pte_val(*ptep) & _PAGE_PROTECT)) { - pte_unmap_unlock(*ptep, ptl); - if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { - up_read(&mm->mmap_sem); - return -EFAULT; - } - goto retry; + pte_unmap_unlock(*ptep, ptl); + if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { + up_read(&mm->mmap_sem); + return -EFAULT; } + goto retry; + } new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | @@ -1038,8 +987,7 @@ static inline int page_table_with_pgste(struct page *page) return 0; } -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, - unsigned long vmaddr) +static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm) { return NULL; } @@ -1053,8 +1001,8 @@ static inline void page_table_free_pgste(unsigned long *table) { } -static inline void gmap_disconnect_pgtable(struct mm_struct *mm, - unsigned long *table) +static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) { } @@ -1074,14 +1022,14 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) /* * page table entry allocation/free routines. 
*/ -unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr) +unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long *uninitialized_var(table); struct page *uninitialized_var(page); unsigned int mask, bit; if (mm_has_pgste(mm)) - return page_table_alloc_pgste(mm, vmaddr); + return page_table_alloc_pgste(mm); /* Allocate fragments of a 4K page as 1K/2K page table */ spin_lock_bh(&mm->context.list_lock); mask = FRAG_MASK; @@ -1123,10 +1071,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) unsigned int bit, mask; page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (page_table_with_pgste(page)) { - gmap_disconnect_pgtable(mm, table); + if (page_table_with_pgste(page)) return page_table_free_pgste(table); - } /* Free 1K/2K page table fragment of a 4K page */ bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); spin_lock_bh(&mm->context.list_lock); @@ -1158,7 +1104,8 @@ static void __page_table_free_rcu(void *table, unsigned bit) } } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) { struct mm_struct *mm; struct page *page; @@ -1167,7 +1114,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) mm = tlb->mm; page = pfn_to_page(__pa(table) >> PAGE_SHIFT); if (page_table_with_pgste(page)) { - gmap_disconnect_pgtable(mm, table); + gmap_unlink(mm, table, vmaddr); table = (unsigned long *) (__pa(table) | FRAG_MASK); tlb_remove_table(tlb, table); return; @@ -1303,7 +1250,7 @@ again: if (page_table_with_pgste(page)) continue; /* Allocate new page table with pgstes */ - new = page_table_alloc_pgste(mm, addr); + new = page_table_alloc_pgste(mm); if (!new) return -ENOMEM; @@ -1318,7 +1265,7 @@ again: /* Establish new table */ pmd_populate(mm, pmd, (pte_t *) new); /* Free old table with rcu, there might be a walker! 
*/ - page_table_free_rcu(tlb, table); + page_table_free_rcu(tlb, table, addr); new = NULL; } spin_unlock(ptl); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index fe9012a49aa5..fdbd7888cb07 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -65,7 +65,7 @@ static pte_t __ref *vmem_pte_alloc(unsigned long address) pte_t *pte; if (slab_is_available()) - pte = (pte_t *) page_table_alloc(&init_mm, address); + pte = (pte_t *) page_table_alloc(&init_mm); else pte = alloc_bootmem_align(PTRS_PER_PTE * sizeof(pte_t), PTRS_PER_PTE * sizeof(pte_t)); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 516903b98e06..094292a63e74 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -202,6 +202,7 @@ #define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7c492ed9087b..7d603a71ab3a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -99,10 +99,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define ASYNC_PF_PER_VCPU 64 -struct kvm_vcpu; -struct kvm; -struct kvm_async_pf; - enum kvm_reg { VCPU_REGS_RAX = 0, VCPU_REGS_RCX = 1, @@ -266,7 +262,8 @@ struct kvm_mmu { struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); - gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); 
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); @@ -481,6 +478,7 @@ struct kvm_vcpu_arch { u64 mmio_gva; unsigned access; gfn_t mmio_gfn; + u64 mmio_gen; struct kvm_pmu pmu; @@ -576,11 +574,10 @@ struct kvm_arch { struct kvm_apic_map *apic_map; unsigned int tss_addr; - struct page *apic_access_page; + bool apic_access_page_done; gpa_t wall_clock; - struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; @@ -665,8 +662,8 @@ struct msr_data { struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ - int (*hardware_enable)(void *dummy); - void (*hardware_disable)(void *dummy); + int (*hardware_enable)(void); + void (*hardware_disable)(void); void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ @@ -710,7 +707,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -740,6 +736,7 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); @@ -772,6 +769,8 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); }; struct kvm_arch_async_pf { @@ -895,7 +894,6 @@ void kvm_inject_page_fault(struct 
kvm_vcpu *vcpu, struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); static inline int __kvm_irq_line_state(unsigned long *irq_state, @@ -917,7 +915,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); int fx_init(struct kvm_vcpu *vcpu); -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); @@ -926,7 +923,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, @@ -946,7 +944,8 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) { return gpa; } @@ -1037,7 +1036,7 @@ asmlinkage void kvm_spurious_fault(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void 
kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); @@ -1046,6 +1045,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c7678e43465b..e62cf897f781 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -2,6 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include <asm/processor.h> +#include <asm/alternative.h> #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); @@ -16,10 +17,15 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -/* This instruction is vmcall. On non-VT architectures, it will generate a - * trap that we will then rewrite to the appropriate instruction. +#ifdef CONFIG_DEBUG_RODATA +#define KVM_HYPERCALL \ + ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) +#else +/* On AMD processors, vmcall will generate a trap that we will + * then rewrite to the appropriate instruction. */ #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" +#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. 
The hypervisor may replace it with something else but only the diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 60e5497681f5..813d29d00a17 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -525,6 +525,13 @@ static void early_init_amd(struct cpuinfo_x86 *c) } #endif + /* + * This is only needed to tell the kernel whether to use VMCALL + * and VMMCALL. VMMCALL is never executed except under virt, so + * we can set it unconditionally. + */ + set_cpu_cap(c, X86_FEATURE_VMMCALL); + /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 38a0afe83c6b..976e3a57f9ea 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -53,14 +53,14 @@ u64 kvm_supported_xcr0(void) return xcr0; } -void kvm_update_cpuid(struct kvm_vcpu *vcpu) +int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) - return; + return 0; /* Update OSXSAVE bit */ if (cpu_has_xsave && best->function == 0x1) { @@ -88,7 +88,17 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) xstate_required_size(vcpu->arch.xcr0); } + /* + * The existing code assumes virtual address is 48-bit in the canonical + * address checks; exit if it is ever changed. 
+ */ + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best && ((best->eax & 0xff00) >> 8) != 48 && + ((best->eax & 0xff00) >> 8) != 0) + return -EINVAL; + kvm_pmu_cpuid_update(vcpu); + return 0; } static int is_efer_nx(void) @@ -112,8 +122,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); + if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) { + entry->edx &= ~bit(X86_FEATURE_NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -151,10 +161,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, } vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); - r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); + r = kvm_update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -178,9 +187,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); - return 0; - + r = kvm_update_cpuid(vcpu); out: return r; } @@ -767,6 +774,12 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); + /* + * Perfmon not yet supported for L2 guest. 
+ */ + if (is_guest_mode(vcpu) && function == 0xa) + best = NULL; + if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a5380590ab0e..4452eedfaedd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -3,7 +3,7 @@ #include "x86.h" -void kvm_update_cpuid(struct kvm_vcpu *vcpu); +int kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_X2APIC)); } +static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0, 0); + return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; +} + static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 03954f7900f5..a46207a05835 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -527,6 +527,7 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { + WARN_ON(vec > 0x1f); ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -1468,7 +1469,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = GP_VECTOR; + err_vec = in_task_switch ? 
TS_VECTOR : GP_VECTOR; /* can't load system descriptor into segment selector */ if (seg <= VCPU_SREG_GS && !seg_desc.s) @@ -1503,6 +1504,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (rpl > cpl || dpl != cpl) goto exception; } + /* in long-mode d/b must be clear if l is set */ + if (seg_desc.d && seg_desc.l) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + goto exception; + } + /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; break; @@ -1549,8 +1559,7 @@ load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); return X86EMUL_CONTINUE; exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; + return emulate_exception(ctxt, err_vec, err_code, true); } static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, @@ -2723,8 +2732,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - emulate_ts(ctxt, tss_selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + return emulate_ts(ctxt, tss_selector & 0xfffc); } if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { @@ -3016,7 +3024,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) ctxt->dst.val = swab64(ctxt->src.val); break; default: - return X86EMUL_PROPAGATE_FAULT; + BUG(); } return X86EMUL_CONTINUE; } @@ -3140,12 +3148,8 @@ static int em_clts(struct x86_emulate_ctxt *ctxt) static int em_vmcall(struct x86_emulate_ctxt *ctxt) { - int rc; - - if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) - return X86EMUL_UNHANDLEABLE; + int rc = ctxt->ops->fix_hypercall(ctxt); - rc = ctxt->ops->fix_hypercall(ctxt); if (rc != X86EMUL_CONTINUE) return rc; @@ -3563,6 +3567,12 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static const struct opcode group7_rm0[] = 
{ + N, + I(SrcNone | Priv | EmulateOnUD, em_vmcall), + N, N, N, N, N, N, +}; + static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), DI(SrcNone | Priv, mwait), @@ -3656,7 +3666,7 @@ static const struct group_dual group7 = { { II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | Priv | EmulateOnUD, em_vmcall), + EXT(0, group7_rm0), EXT(0, group7_rm1), N, EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3687,14 +3697,18 @@ static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; -static const struct gprefix pfx_vmovntpx = { - I(0, em_mov), N, N, N, +static const struct gprefix pfx_0f_2b = { + I(0, em_mov), I(0, em_mov), N, N, }; static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; +static const struct gprefix pfx_0f_e7 = { + N, I(Sse, em_mov), N, N, +}; + static const struct escape escape_d9 = { { N, N, N, N, N, N, N, I(DstMem, em_fnstcw), }, { @@ -3901,7 +3915,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), - N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), @@ -3965,7 +3979,8 @@ static const struct opcode twobyte_table[256] = { /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), + N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; @@ -4829,8 +4844,10 @@ writeback: ctxt->eip = ctxt->_eip; done: - if (rc == X86EMUL_PROPAGATE_FAULT) + if (rc == X86EMUL_PROPAGATE_FAULT) { + 
WARN_ON(ctxt->exception.vector > 0x1f); ctxt->have_exception = true; + } if (rc == X86EMUL_INTERCEPTED) return EMULATION_INTERCEPTED; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 08e8a899e005..b8345dd41b25 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -112,17 +112,6 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) struct static_key_deferred apic_hw_disabled __read_mostly; struct static_key_deferred apic_sw_disabled __read_mostly; -static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) -{ - if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) { - if (val & APIC_SPIV_APIC_ENABLED) - static_key_slow_dec_deferred(&apic_sw_disabled); - else - static_key_slow_inc(&apic_sw_disabled.key); - } - apic_set_reg(apic, APIC_SPIV, val); -} - static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); @@ -210,6 +199,20 @@ out: kvm_vcpu_request_scan_ioapic(kvm); } +static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) +{ + u32 prev = kvm_apic_get_reg(apic, APIC_SPIV); + + apic_set_reg(apic, APIC_SPIV, val); + if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) { + if (val & APIC_SPIV_APIC_ENABLED) { + static_key_slow_dec_deferred(&apic_sw_disabled); + recalculate_apic_map(apic->vcpu->kvm); + } else + static_key_slow_inc(&apic_sw_disabled.key); + } +} + static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) { apic_set_reg(apic, APIC_ID, id << 24); @@ -706,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; @@ -727,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } - 
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, false); break; case APIC_DM_REMRD: @@ -1352,6 +1355,9 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) return; hrtimer_cancel(&apic->lapic_timer.timer); + /* Inject here so clearing tscdeadline won't override new value */ + if (apic_has_pending_timer(vcpu)) + kvm_inject_apic_timer_irqs(vcpu); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } @@ -1639,6 +1645,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); + if (apic_lvtt_tscdeadline(apic)) + apic->lapic_timer.tscdeadline = 0; atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 931467881da7..3201e93ebd07 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); /* - * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, - * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation - * number. + * the low bit of the generation number is always presumed to be zero. + * This disables mmio caching during memslot updates. The concept is + * similar to a seqcount but instead of retrying the access we just punt + * and ignore the cache. + * + * spte bits 3-11 are used as bits 1-9 of the generation number, + * the bits 52-61 are used as bits 10-19 of the generation number. 
*/ -#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_LOW_SHIFT 2 #define MMIO_SPTE_GEN_HIGH_SHIFT 52 -#define MMIO_GEN_SHIFT 19 -#define MMIO_GEN_LOW_SHIFT 9 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_SHIFT 20 +#define MMIO_GEN_LOW_SHIFT 10 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) @@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) static unsigned int kvm_current_mmio_generation(struct kvm *kvm) { - /* - * Init kvm generation close to MMIO_MAX_GEN to easily test the - * code of handling generation number wrap-around. - */ - return (kvm_memslots(kvm)->generation + - MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; + return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; } static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, @@ -296,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte) return likely(kvm_gen == spte_gen); } -static inline u64 rsvd_bits(int s, int e) -{ - return ((1ULL << (e - s + 1)) - 1) << s; -} - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -1180,7 +1174,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * - * Note: write protection is difference between drity logging and spte + * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. 
@@ -1268,7 +1262,8 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1276,7 +1271,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); drop_spte(kvm, sptep); need_tlb_flush = 1; @@ -1286,7 +1282,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1300,7 +1297,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!is_shadow_present_pte(*sptep)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", + sptep, *sptep, gfn, level); need_flush = 1; @@ -1334,6 +1332,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, + int level, unsigned long data)) { int j; @@ -1363,6 +1363,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { unsigned long idx, idx_end; unsigned long *rmapp; + gfn_t gfn = gfn_start; /* * {idx(page_j) | page_j intersects with @@ -1373,8 +1374,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, rmapp = __gfn_to_rmap(gfn_start, j, memslot); - for (; idx <= idx_end; ++idx) - ret |= 
handler(kvm, rmapp++, memslot, data); + for (; idx <= idx_end; + ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) + ret |= handler(kvm, rmapp++, memslot, + gfn, j, data); } } @@ -1385,6 +1388,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, int (*handler)(struct kvm *kvm, unsigned long *rmapp, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long data)) { return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); @@ -1406,24 +1410,14 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) } static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) { u64 *sptep; struct rmap_iterator uninitialized_var(iter); int young = 0; - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask) { - young = kvm_unmap_rmapp(kvm, rmapp, slot, data); - goto out; - } + BUG_ON(!shadow_accessed_mask); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1435,14 +1429,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } -out: - /* @data has hva passed to kvm_age_hva(). 
*/ - trace_kvm_age_page(data, slot, young); + trace_kvm_age_page(gfn, level, slot, young); return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, unsigned long data) + struct kvm_memory_slot *slot, gfn_t gfn, + int level, unsigned long data) { u64 *sptep; struct rmap_iterator iter; @@ -1480,13 +1473,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0); + kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp); + /* + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ + if (!shadow_accessed_mask) { + /* + * We are holding the kvm->mmu_lock, and we are blowing up + * shadow PTEs. MMU notifier consumers need to be kept at bay. + * This is correct as long as we don't decouple the mmu_lock + * protected regions (like invalidate_range_start|end does). 
+ */ + kvm->mmu_notifier_seq++; + return kvm_handle_hva_range(kvm, start, end, 0, + kvm_unmap_rmapp); + } + + return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) @@ -1749,7 +1762,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -1802,7 +1815,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } struct mmu_page_path { @@ -2536,7 +2549,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, true, host_writable)) { if (write_fault) *emulate = 1; - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep) && emulate)) @@ -3163,7 +3176,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - vcpu_clear_mmio_info(vcpu, ~0ul); + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -3206,7 +3219,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, { if (exception) exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -3450,13 +3463,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); - void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -3518,6 +3524,7 @@ static void 
reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; + u64 nonleaf_bit8_rsvd = 0; context->bad_mt_xwr = 0; @@ -3525,6 +3532,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, exb_bit_rsvd = rsvd_bits(63, 63); if (!guest_cpuid_has_gbpages(vcpu)) gbpages_bit_rsvd = rsvd_bits(7, 7); + + /* + * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for + * leaf entries) on AMD CPUs only. + */ + if (guest_cpuid_is_amd(vcpu)) + nonleaf_bit8_rsvd = rsvd_bits(8, 8); + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ @@ -3559,9 +3574,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7); + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[0][0] = exb_bit_rsvd | @@ -3962,7 +3977,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); else if (local_flush) - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -4223,7 +4238,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -4433,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) * The very rare case: if the generation-number is round, * zap all shadow pages. 
*/ - if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { + if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b982112d2ca5..bde8ee725754 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,6 +56,11 @@ #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL << (e - s + 1)) - 1) << s; +} + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 410776528265..806d58e3c320 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -298,8 +298,7 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; @@ -321,9 +320,22 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK); + PFERR_USER_MASK|PFERR_WRITE_MASK, + &walker->fault); + + /* + * FIXME: This can happen if emulation (for of an INS/OUTS + * instruction) triggers a nested page fault. The exit + * qualification / exit info field will incorrectly have + * "guest page access" as the nested page fault's cause, + * instead of "guest page structure access". To fix this, + * the x86_exception struct should be augmented with enough + * information to fix the exit_qualification or exit_info_1 + * fields. 
+ */ if (unlikely(real_gfn == UNMAPPED_GVA)) - goto error; + return 0; + real_gfn = gpa_to_gfn(real_gfn); host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, @@ -364,7 +376,7 @@ retry_walk: if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) return 0; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3dd6accb64ec..8e6b7d869d2f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -15,6 +15,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -463,7 +464,8 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_cpuid_entry2 *entry; - unsigned bitmap_len; + union cpuid10_eax eax; + union cpuid10_edx edx; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -475,25 +477,27 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) return; + eax.full = entry->eax; + edx.full = entry->edx; - pmu->version = entry->eax & 0xff; + pmu->version = eax.split.version_id; if (!pmu->version) return; - pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = - ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; - bitmap_len = (entry->eax >> 24) & 0xff; - pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); if (pmu->version == 1) { pmu->nr_arch_fixed_counters = 0; } else { - 
pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; + ((u64)1 << edx.split.bit_width_fixed) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ddf742768ecf..f7f6a4a157a6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -622,7 +622,7 @@ static int has_svm(void) return 1; } -static void svm_hardware_disable(void *garbage) +static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) @@ -633,7 +633,7 @@ static void svm_hardware_disable(void *garbage) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void *garbage) +static int svm_hardware_enable(void) { struct svm_cpu_data *sd; @@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; @@ -1974,10 +1975,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; + if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + /* + * TODO: track the cause of the nested page fault, and + * correctly fill in the high bits of exit_info_1. 
+ */ + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = (1ULL << 32); + svm->vmcb->control.exit_info_2 = fault->address; + } + + svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; + svm->vmcb->control.exit_info_1 |= fault->error_code; + + /* + * The present bit is always zero for page structure faults on real + * hardware. + */ + if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) + svm->vmcb->control.exit_info_1 &= ~1; nested_svm_vmexit(svm); } @@ -3031,7 +3048,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + @@ -4305,6 +4322,10 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4349,7 +4370,6 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -4406,6 +4426,8 @@ static struct kvm_x86_ops svm_x86_ops = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + + .sched_in = svm_sched_in, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e850a7d332be..6b06ab8748dd 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -415,15 +415,14 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), - TP_ARGS(apicid, dm, tm, vec, coalesced), + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_ARGS(apicid, dm, tm, 
vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) __field( __u8, tm ) __field( __u8, vec ) - __field( bool, coalesced ) ), TP_fast_assign( @@ -431,14 +430,12 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->dm = dm; __entry->tm = tm; __entry->vec = vec; - __entry->coalesced = coalesced; ), - TP_printk("apicid %x vec %u (%s|%s)%s", + TP_printk("apicid %x vec %u (%s|%s)", __entry->apicid, __entry->vec, __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), - __entry->tm ? "level" : "edge", - __entry->coalesced ? " (coalesced)" : "") + __entry->tm ? "level" : "edge") ); TRACE_EVENT(kvm_eoi, @@ -850,6 +847,36 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +TRACE_EVENT(kvm_ple_window, + TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), + TP_ARGS(grow, vcpu_id, new, old), + + TP_STRUCT__entry( + __field( bool, grow ) + __field( unsigned int, vcpu_id ) + __field( int, new ) + __field( int, old ) + ), + + TP_fast_assign( + __entry->grow = grow; + __entry->vcpu_id = vcpu_id; + __entry->new = new; + __entry->old = old; + ), + + TP_printk("vcpu %u: ple_window %d (%s %d)", + __entry->vcpu_id, + __entry->new, + __entry->grow ? "grow" : "shrink", + __entry->old) +); + +#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ + trace_kvm_ple_window(true, vcpu_id, new, old) +#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ + trace_kvm_ple_window(false, vcpu_id, new, old) + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe11cf124a1..04fa1b8298c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,14 +125,32 @@ module_param(nested, bool, S_IRUGO); * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. 
*/ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ + INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW + static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +/* Default doubles per-vcpu window every exit. */ +static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, int, S_IRUGO); + +/* Default resets per-vcpu window every exit to ple_window. */ +static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, int, S_IRUGO); + +/* Default is to compute the maximum so we can never overflow. */ +static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, int, S_IRUGO); + extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 @@ -379,6 +397,7 @@ struct nested_vmx { * we must keep them pinned while L2 runs. */ struct page *apic_access_page; + struct page *virtual_apic_page; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -484,6 +503,10 @@ struct vcpu_vmx { /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; + + /* Dynamic PLE window. 
*/ + int ple_window; + bool ple_window_dirty; }; enum segment_cache_field { @@ -533,6 +556,7 @@ static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static unsigned long shadow_read_write_fields[] = { + TPR_THRESHOLD, GUEST_RIP, GUEST_RSP, GUEST_CR0, @@ -743,6 +767,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); +static int alloc_identity_pagetable(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2135,7 +2160,7 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { u64 tsc_offset; @@ -2330,7 +2355,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2601,6 +2626,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; @@ -2704,7 +2731,7 @@ static void kvm_cpu_vmxon(u64 addr) : "memory", "cc"); } -static int hardware_enable(void *garbage) +static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2768,7 
+2795,7 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); } -static void hardware_disable(void *garbage) +static void hardware_disable(void) { if (vmm_exclusive) { vmclear_local_loaded_vmcss(); @@ -3107,9 +3134,17 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) + if (!cpu_has_vmx_flexpriority()) { flexpriority_enabled = 0; + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -3905,7 +3940,7 @@ static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; u16 data = 0; - int r, idx, ret = 0; + int idx, r; idx = srcu_read_lock(&kvm->srcu); fn = kvm->arch.tss_addr >> PAGE_SHIFT; @@ -3927,32 +3962,32 @@ static int init_rmode_tss(struct kvm *kvm) r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, sizeof(u8)); - if (r < 0) - goto out; - - ret = 1; out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + return r; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, idx, r, ret; + int i, idx, r = 0; pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) - return 1; - if (unlikely(!kvm->arch.ept_identity_pagetable)) { - printk(KERN_ERR "EPT: identity-mapping pagetable " - "haven't been allocated!\n"); return 0; - } + + /* Protect kvm->arch.ept_identity_pagetable_done. 
*/ + mutex_lock(&kvm->slots_lock); + if (likely(kvm->arch.ept_identity_pagetable_done)) - return 1; - ret = 0; + goto out2; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + + r = alloc_identity_pagetable(kvm); + if (r < 0) + goto out2; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) @@ -3967,10 +4002,13 @@ static int init_rmode_identity_map(struct kvm *kvm) goto out; } kvm->arch.ept_identity_pagetable_done = true; - ret = 1; + out: srcu_read_unlock(&kvm->srcu, idx); - return ret; + +out2: + mutex_unlock(&kvm->slots_lock); + return r; } static void seg_setup(int seg) @@ -3995,23 +4033,28 @@ static int alloc_apic_access_page(struct kvm *kvm) int r = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page) + if (kvm->arch.apic_access_page_done) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; - page = gfn_to_page(kvm, 0xfee00); + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { r = -EFAULT; goto out; } - kvm->arch.apic_access_page = page; + /* + * Do not pin the page in memory, so that memory hot-unplug + * is able to migrate it. + */ + put_page(page); + kvm->arch.apic_access_page_done = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -4019,31 +4062,20 @@ out: static int alloc_identity_pagetable(struct kvm *kvm) { - struct page *page; + /* Called with kvm->slots_lock held. 
*/ + struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - mutex_lock(&kvm->slots_lock); - if (kvm->arch.ept_identity_pagetable) - goto out; + BUG_ON(kvm->arch.ept_identity_pagetable_done); + kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); - if (r) - goto out; - - page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - kvm->arch.ept_identity_pagetable = page; -out: - mutex_unlock(&kvm->slots_lock); return r; } @@ -4402,7 +4434,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); @@ -4477,7 +4510,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&vmx->vcpu)) apic_base_msr.data |= MSR_IA32_APICBASE_BSP; apic_base_msr.host_initiated = true; @@ -4537,9 +4570,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, 0); } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); if (vmx_vm_has_apicv(vcpu->kvm)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); @@ -4729,10 +4760,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; - if (!init_rmode_tss(kvm)) - return -ENOMEM; - - return 0; + return init_rmode_tss(kvm); } static 
bool rmode_exception(struct kvm_vcpu *vcpu, int vec) @@ -5521,17 +5549,18 @@ static u64 ept_rsvd_mask(u64 spte, int level) for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) mask |= (1ULL << i); - if (level > 2) + if (level == 4) /* bits 7:3 reserved */ mask |= 0xf8; - else if (level == 2) { - if (spte & (1ULL << 7)) - /* 2MB ref, bits 20:12 reserved */ - mask |= 0x1ff000; - else - /* bits 6:3 reserved */ - mask |= 0x78; - } + else if (spte & (1ULL << 7)) + /* + * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, + * level == 1 if the hypervisor is using the ignored bit 7. + */ + mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; + else if (level > 1) + /* bits 6:3 reserved */ + mask |= 0x78; return mask; } @@ -5561,7 +5590,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, WARN_ON(1); } - if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + /* bits 5:3 are _not_ reserved for large page or leaf page */ + if ((rsvd_bits & 0x38) == 0) { u64 ept_mem_type = (spte & 0x38) >> 3; if (ept_mem_type == 2 || ept_mem_type == 3 || @@ -5676,12 +5706,85 @@ out: return ret; } +static int __grow_ple_window(int val) +{ + if (ple_window_grow < 1) + return ple_window; + + val = min(val, ple_window_actual_max); + + if (ple_window_grow < ple_window) + val *= ple_window_grow; + else + val += ple_window_grow; + + return val; +} + +static int __shrink_ple_window(int val, int modifier, int minimum) +{ + if (modifier < 1) + return ple_window; + + if (modifier < ple_window) + val /= modifier; + else + val -= modifier; + + return max(val, minimum); +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + 
int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, + ple_window_shrink, ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * ple_window_actual_max is computed to be one grow_ple_window() below + * ple_window_max. (See __grow_ple_window for the reason.) + * This prevents overflows, because ple_window_max is int. + * ple_window_max effectively rounded down to a multiple of ple_window_grow in + * this process. + * ple_window_max is also prevented from setting vmx->ple_window < ple_window. + */ +static void update_ple_window_actual_max(void) +{ + ple_window_actual_max = + __shrink_ple_window(max(ple_window_max, ple_window), + ple_window_grow, INT_MIN); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { + if (ple_gap) + grow_ple_window(vcpu); + skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); @@ -6146,7 +6249,11 @@ static void free_nested(struct vcpu_vmx *vmx) /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } nested_free_all_saved_vmcss(vmx); @@ -6617,7 +6724,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); break; default: @@ -6892,6 +6999,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: + if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) + 
return 0; return 1; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -6936,7 +7045,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_MCE_DURING_VMENTRY: return 0; case EXIT_REASON_TPR_BELOW_THRESHOLD: - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); @@ -7057,6 +7166,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (is_guest_mode(vcpu) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return; + if (irr == -1 || tpr < irr) { vmcs_write32(TPR_THRESHOLD, 0); return; @@ -7094,6 +7209,29 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) vmx_set_msr_bitmap(vcpu); } +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently we do not handle the nested case where L2 has an + * APIC access page of its own; that page is still pinned. + * Hence, we skip the case where the VCPU is in guest mode _and_ + * L1 prepared an APIC access page for L2. + * + * For the case where L1 and L2 share the same APIC access page + * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear + * in the vmcs12), this function will only update either the vmcs01 + * or the vmcs02. If the former, the vmcs02 will be updated by + * prepare_vmcs02. If the latter, the vmcs01 will be updated in + * the next L2->L1 exit. 
+ */ + if (!is_guest_mode(vcpu) || + !nested_cpu_has2(vmx->nested.current_vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + vmcs_write64(APIC_ACCESS_ADDR, hpa); +} + static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) { u16 status; @@ -7387,6 +7525,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + if (vmx->nested.sync_shadow_vmcs) { copy_vmcs12_to_shadow(vmx); vmx->nested.sync_shadow_vmcs = false; @@ -7642,10 +7785,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; - if (!init_rmode_identity_map(kvm)) + err = init_rmode_identity_map(kvm); + if (err) goto free_vmcs; } @@ -7824,6 +7965,55 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, kvm_inject_page_fault(vcpu, fault); } +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->apic_access_addr)) + return false; + + /* + * Translate L1 physical address to host physical + * address for vmcs02. Keep the page pinned, so this + * physical address remains valid. We keep a reference + * to it so we can release it later. 
+ */ + if (vmx->nested.apic_access_page) /* shouldn't happen */ + nested_release_page(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + } + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + /* TODO: Also verify bits beyond physical address width are 0 */ + if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr)) + return false; + + if (vmx->nested.virtual_apic_page) /* shouldn't happen */ + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + + /* + * Failing the vm entry is _not_ what the processor does + * but it's basically the only possibility we have. + * We could still enter the guest if CR8 load exits are + * enabled, CR8 store exits are enabled, and virtualize APIC + * access is disabled; in this case the processor would never + * use the TPR shadow and we could simply clear the bit from + * the execution control. But such a configuration is useless, + * so let's keep the code simple. + */ + if (!vmx->nested.virtual_apic_page) + return false; + } + + return true; +} + static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) { u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; @@ -7849,7 +8039,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. 
In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various @@ -7970,16 +8160,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); - /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do @@ -7994,8 +8174,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vcpu->kvm->arch.apic_access_page)); + kvm_vcpu_reload_apic_access_page(vcpu); } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); @@ -8024,6 +8203,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->nested.virtual_apic_page)); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } + /* * Merging of IO and MSR bitmaps not currently supported. * Rather, exit every time. 
@@ -8185,8 +8371,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !PAGE_ALIGNED(vmcs12->apic_access_addr)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8790,10 +8975,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; + vmx->nested.apic_access_page = NULL; + } + if (vmx->nested.virtual_apic_page) { + nested_release_page(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; } /* + * We are now running in L2, mmu_notifier will force to reload the + * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. + */ + kvm_vcpu_reload_apic_access_page(vcpu); + + /* * Exiting from L2 to L1, we're now back to L1 which thinks it just * finished a VMLAUNCH or VMRESUME instruction, so we need to set the * success or failure flag accordingly. 
@@ -8846,6 +9041,12 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (ple_gap) + shrink_ple_window(vcpu); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -8890,7 +9091,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -8913,6 +9113,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .vm_has_apicv = vmx_vm_has_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, @@ -8951,6 +9152,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .mpx_supported = vmx_mpx_supported, .check_nested_events = vmx_check_nested_events, + + .sched_in = vmx_sched_in, }; static int __init vmx_init(void) @@ -9065,6 +9268,8 @@ static int __init vmx_init(void) } else kvm_disable_tdp(); + update_ple_window_actual_max(); + return 0; out7: @@ -9098,7 +9303,7 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_vmread_bitmap); #ifdef CONFIG_KEXEC - rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f1e22d3b286..5430e4b0af29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,7 +246,7 @@ void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); -static void drop_user_return_notifiers(void *ignore) +static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); struct 
kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); @@ -408,12 +408,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else vcpu->arch.mmu.inject_page_fault(vcpu, fault); + + return fault->nested_page_fault; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -457,11 +459,12 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t ngfn, void *data, int offset, int len, u32 access) { + struct x86_exception exception; gfn_t real_gfn; gpa_t ngpa; ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); if (real_gfn == UNMAPPED_GVA) return -EFAULT; @@ -726,7 +729,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -1518,7 +1521,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) pvclock_update_vm_gtod_copy(kvm); kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) @@ -1661,7 +1664,7 @@ static void kvmclock_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_vcpu_kick(vcpu); } } @@ -1670,7 +1673,7 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { struct kvm *kvm = v->kvm; - set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + 
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); schedule_delayed_work(&kvm->arch.kvmclock_update_work, KVMCLOCK_UPDATE_DELAY); } @@ -1723,9 +1726,10 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; + u64 mask; if (!msr_mtrr_valid(msr)) return false; @@ -1747,14 +1751,31 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - return valid_mtrr_type(data & 0xff); + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + + mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + if ((msr & 1) == 0) { + /* MTRR base */ + if (!valid_mtrr_type(data & 0xff)) + return false; + mask |= 0xf00; + } else + /* MTRR mask */ + mask |= 0x7ff; + if (data & mask) { + kvm_inject_gp(vcpu, 0); + return false; + } + + return true; } +EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - if (!mtrr_valid(vcpu, msr, data)) + if (!kvm_mtrr_valid(vcpu, msr, data)) return 1; if (msr == MSR_MTRRdefType) { @@ -1805,7 +1826,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; default: if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = msr - MSR_IA32_MC0_CTL; /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to @@ -2164,7 +2185,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr, data); /* Performance counters are not protected by a CPUID bit, @@ -2330,7 +2351,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) break; default: if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = msr - MSR_IA32_MC0_CTL; data = vcpu->arch.mce_banks[offset]; break; @@ -2419,7 +2440,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: @@ -2505,7 +2532,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CAP: case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr, pdata); case MSR_K7_CLK_CTL: /* @@ -2823,7 +2850,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); vcpu->arch.tsc_offset_adjustment = 0; - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { @@ -4040,16 +4067,16 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) { gpa_t t_gpa; - struct x86_exception exception; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception); return t_gpa; } @@ -4906,16 +4933,18 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) } } -static void inject_emulated_exception(struct kvm_vcpu *vcpu) +static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) + return kvm_propagate_fault(vcpu, &ctxt->exception); + + if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); + return false; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -4972,7 +5001,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { + if 
(!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -5224,6 +5253,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ctxt->interruptibility = 0; ctxt->have_exception = false; + ctxt->exception.vector = -1; ctxt->perm_ok = false; ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; @@ -5276,8 +5306,9 @@ restart: } if (ctxt->have_exception) { - inject_emulated_exception(vcpu); r = EMULATE_DONE; + if (inject_emulated_exception(vcpu)) + return r; } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -5545,7 +5576,7 @@ static void kvm_set_mmio_spte_mask(void) * entry to generate page fault with PFER.RSV = 1. */ /* Mask the reserved physical address bits. */ - mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr; + mask = rsvd_bits(maxphyaddr, 51); /* Bit 62 is always reserved for 32bit host. 
*/ mask |= 0x3ull << 62; @@ -5576,7 +5607,7 @@ static void pvclock_gtod_update_fn(struct work_struct *work) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); spin_unlock(&kvm_lock); } @@ -5989,6 +6020,44 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_apic_update_tmr(vcpu, tmr); } +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +{ + struct page *page = NULL; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + if (!kvm_x86_ops->set_apic_access_page_addr) + return; + + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); + +void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ + /* + * The physical address of apic access page is stored in the VMCS. + * Update it when it becomes invalid. + */ + if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} + /* * Returns 1 to let __vcpu_run() continue the guest execution loop without * exiting to the userspace. 
Otherwise, the value will be returned to the @@ -6018,7 +6087,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_x86_ops->tlb_flush(vcpu); + kvm_vcpu_flush_tlb(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -6049,6 +6118,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); + if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) + kvm_vcpu_reload_apic_access_page(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6934,7 +7005,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) kvm_rip_write(vcpu, 0); } -int kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; @@ -6945,7 +7016,7 @@ int kvm_arch_hardware_enable(void *garbage) bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; @@ -6954,7 +7025,7 @@ int kvm_arch_hardware_enable(void *garbage) list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!stable && vcpu->cpu == smp_processor_id()) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (stable && vcpu->arch.last_host_tsc > local_tsc) { backwards_tsc = true; if (vcpu->arch.last_host_tsc > max_tsc) @@ -7008,8 +7079,7 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); } /* @@ -7026,10 +7096,10 @@ int 
kvm_arch_hardware_enable(void *garbage) return 0; } -void kvm_arch_hardware_disable(void *garbage) +void kvm_arch_hardware_disable(void) { - kvm_x86_ops->hardware_disable(garbage); - drop_user_return_notifiers(garbage); + kvm_x86_ops->hardware_disable(); + drop_user_return_notifiers(); } int kvm_arch_hardware_setup(void) @@ -7146,6 +7216,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) static_key_slow_dec(&kvm_no_apic_vcpu); } +void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->sched_in(vcpu, cpu); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { if (type) @@ -7237,10 +7312,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } @@ -7643,3 +7714,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 306a1b77581f..7cb9c45a5fe0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -88,15 +88,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; + vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; +} + +static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation; } /* - * Clear the mmio cache info for the given gva, - * specially, if gva is ~0ul, we clear all mmio cache info. + * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we + * clear all mmio cache info. 
*/ +#define MMIO_GVA_ANY (~(gva_t)0) + static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) { - if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) + if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) return; vcpu->arch.mmio_gva = 0; @@ -104,7 +112,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) { - if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && + vcpu->arch.mmio_gva == (gva & PAGE_MASK)) return true; return false; @@ -112,7 +121,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { - if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) + if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn && + vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) return true; return false; @@ -149,6 +159,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); + #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ | XSTATE_BNDREGS | XSTATE_BNDCSR) extern u64 host_xcr0; |