diff options
23 files changed, 1005 insertions, 88 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4a661e555c09..110484915aa0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3035,6 +3035,63 @@ Returns: 0 on success, -1 on error Queues an SMI on the thread's vcpu. +4.97 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is +present in the "ibm,hypertas-functions" device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user space. So user space still has to have +an implementation for these despite the in kernel acceleration. + +This capability is always enabled. + +4.98 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows, described in 4.62 KVM_CREATE_SPAPR_TCE + +This capability uses extended struct in ioctl interface: + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + +The aim of extension is to support an additional bigger DMA window with +a variable page size. +KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and +a bus offset of the corresponding DMA window, @size and @offset are numbers +of IOMMU pages. + +@flags are not used at the moment. + +The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2aa79c864e91..7529aab068f5 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) } #endif -#define SPAPR_TCE_SHIFT 12 - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9d08d8cbed1a..2e7c79101652 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -182,7 +182,10 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; + struct rcu_head rcu; + u32 page_shift; + u64 offset; /* in pages */ + u64 size; /* window size in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2241d5357129..2544edabe7f3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); +extern struct kvmppc_spapr_tce_table *kvmppc_find_table( + struct kvm_vcpu *vcpu, unsigned long liobn); +extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages); +extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, + unsigned long tce); +extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap); +extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, + unsigned long idx, unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); +extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); @@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } +extern void kvmppc_alloc_host_rm_ops(void); +extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xics_ipi_action(void); +extern int h_ipi_redirect; #else +static inline void kvmppc_alloc_host_rm_ops(void) {}; +static inline void kvmppc_free_host_rm_ops(void) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } @@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +/* + * Host-side operations we want to set up while running in real + * mode in the guest operating on the xics. + * Currently only VCPU wakeup is supported. + */ + +union kvmppc_rm_state { + unsigned long raw; + struct { + u32 in_host; + u32 rm_action; + }; +}; + +struct kvmppc_host_rm_core { + union kvmppc_rm_state rm_state; + void *rm_data; + char pad[112]; +}; + +struct kvmppc_host_rm_ops { + struct kvmppc_host_rm_core *rm_core; + void (*vcpu_kick)(struct kvm_vcpu *vcpu); +}; + +extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; + static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_BOOKE_HV diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index ac9fb114e25d..47897a30982d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); } + +unsigned long vmalloc_to_phys(void *vmalloc_addr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 825663c30945..78083ed20792 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu); #define PPC_MSG_TICK_BROADCAST 2 #define PPC_MSG_DEBUGGER_BREAK 3 +/* This is only used by the powernv kernel */ +#define PPC_MSG_RM_HOST_ACTION 4 + /* for irq controllers that have dedicated ipis per message (4) */ extern int smp_request_message_ipi(int virq, int message); extern const char *smp_ipi_name[]; @@ -121,6 +124,7 @@ extern const char *smp_ipi_name[]; /* for irq controllers with only a single ipi */ extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); +extern void smp_muxed_ipi_set_message(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); void smp_init_pSeries(void); diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index 0e25bdb190bb..254604856e69 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -30,6 +30,7 @@ #ifdef CONFIG_PPC_ICP_NATIVE extern int icp_native_init(void); extern void icp_native_flush_interrupt(void); +extern void icp_native_cause_ipi_rm(int cpu); #else static inline int icp_native_init(void) { return -ENODEV; } #endif diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index ab4d4732c492..c93cf35ce379 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec2058d2d..cb8be5dc118a 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg) #ifdef CONFIG_PPC_SMP_MUXED_IPI struct cpu_messages { - int messages; /* current messages */ + long messages; /* current messages */ unsigned long data; /* data for cause ipi */ }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); @@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data) info->data = data; } -void smp_muxed_ipi_message_pass(int cpu, int msg) +void smp_muxed_ipi_set_message(int cpu, int msg) { struct cpu_messages *info = &per_cpu(ipi_message, cpu); char *message = (char *)&info->messages; @@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) */ smp_mb(); message[msg] = 1; +} + +void smp_muxed_ipi_message_pass(int cpu, int msg) +{ + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + + smp_muxed_ipi_set_message(cpu, msg); /* * cause_ipi functions are required to include a full barrier * before doing whatever causes the IPI. @@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) } #ifdef __BIG_ENDIAN__ -#define IPI_MESSAGE(A) (1 << (24 - 8 * (A))) +#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A))) #else -#define IPI_MESSAGE(A) (1 << (8 * (A))) +#define IPI_MESSAGE(A) (1uL << (8 * (A))) #endif irqreturn_t smp_ipi_demux(void) { struct cpu_messages *info = this_cpu_ptr(&ipi_message); - unsigned int all; + unsigned long all; mb(); /* order any irq clear */ do { all = xchg(&info->messages, 0); +#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) + /* + * Must check for PPC_MSG_RM_HOST_ACTION messages + * before PPC_MSG_CALL_FUNCTION messages because when + * a VM is destroyed, we call kick_all_cpus_sync() + * to ensure that any pending PPC_MSG_RM_HOST_ACTION + * messages have completed before we free any VCPUs. + */ + if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION)) + kvmppc_xics_ipi_action(); +#endif if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION)) generic_smp_call_function_interrupt(); if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE)) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 0570eef83fba..7f7b6d86ac73 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o + $(KVM)/eventfd.o $(KVM)/vfio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 638c6d9be9e0..b34220d2aa42 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) { #ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); #endif diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc94dad..2c2d1030843a 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com> */ #include <linux/types.h> @@ -36,28 +37,69 @@ #include <asm/ppc-opcode.h> #include <asm/kvm_host.h> #include <asm/udbg.h> +#include <asm/iommu.h> +#include <asm/tce.h> -#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) +{ + return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} -static long kvmppc_stt_npages(unsigned long window_size) +static unsigned long kvmppc_stt_pages(unsigned long tce_pages) { - return ALIGN((window_size >> SPAPR_TCE_SHIFT) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) + + (tce_pages * sizeof(struct page *)); + + return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; } -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) { - struct kvm *kvm = stt->kvm; - int i; + long ret = 0; - mutex_lock(&kvm->lock); - list_del(&stt->list); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + if (!current || !current->mm) + return ret; /* process exited */ + + down_write(¤t->mm->mmap_sem); + + if (inc) { + unsigned long locked, lock_limit; + + locked = current->mm->locked_vm + stt_pages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += stt_pages; + } else { + if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) + stt_pages = current->mm->locked_vm; + + current->mm->locked_vm -= stt_pages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, + inc ? '+' : '-', + stt_pages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void release_spapr_tce_table(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_table *stt = container_of(head, + struct kvmppc_spapr_tce_table, rcu); + unsigned long i, npages = kvmppc_tce_pages(stt->size); + + for (i = 0; i < npages; i++) __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - kvm_put_kvm(kvm); + kfree(stt); } static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; - release_spapr_tce_table(stt); + list_del_rcu(&stt->list); + + kvm_put_kvm(stt->kvm); + + kvmppc_account_memlimit( + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + call_rcu(&stt->rcu, release_spapr_tce_table); + return 0; } @@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + unsigned long npages, size; int ret = -ENOMEM; int i; + if (!args->size) + return -EINVAL; + /* Check this LIOBN hasn't been previously allocated */ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { if (stt->liobn == args->liobn) return -EBUSY; } - npages = kvmppc_stt_npages(args->window_size); + size = args->size; + npages = kvmppc_tce_pages(size); + ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); + if (ret) { + stt = NULL; + goto fail; + } stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); @@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->window_size = args->window_size; + stt->page_shift = args->page_shift; + stt->offset = args->offset; + stt->size = size; stt->kvm = kvm; for (i = 0; i < npages; i++) { @@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); mutex_unlock(&kvm->lock); @@ -148,3 +208,59 @@ fail: } return ret; } + +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS, idx; + unsigned long entry, ua = 0; + u64 __user *tces, tce; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> stt->page_shift; + /* + * SPAPR spec says that the maximum size of the list is 512 TCEs + * so the whole table fits in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tces = (u64 __user *) ua; + + for (i = 0; i < npages; ++i) { + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3e0039..44be73e6aa26 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com> */ #include <linux/types.h> @@ -30,76 +31,321 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> #include <asm/mmu-hash64.h> +#include <asm/mmu_context.h> #include <asm/hvcall.h> #include <asm/synch.h> #include <asm/ppc-opcode.h> #include <asm/kvm_host.h> #include <asm/udbg.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/iommu.h> #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) -/* WARNING: This will be called in real-mode on HV KVM and virtual +/* + * Finds a TCE table descriptor by LIOBN. + * + * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba, unsigned long tce) +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, + unsigned long liobn) { struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} +EXPORT_SYMBOL_GPL(kvmppc_find_table); + +/* + * Validates IO address. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1ULL << stt->page_shift) - 1; + unsigned long idx = ioba >> stt->page_shift; + + if ((ioba & mask) || (idx < stt->offset) || + (idx - stt->offset + npages > stt->size) || + (idx + npages < idx)) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); + unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce & mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * would have to be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + idx -= stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); + +long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap) +{ + unsigned long gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (prmap) + *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce) +{ + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; - - /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ - /* liobn, stt, stt->window_size); */ - if (ioba >= stt->window_size) - return H_PARAMETER; - - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); - - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; - return H_SUCCESS; - } - } + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + return ret; + + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); -long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba) +static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, + unsigned long ua, unsigned long *phpa) +{ + pte_t *ptep, pte; + unsigned shift = 0; + + ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + if (!ptep || !pte_present(*ptep)) + return -ENXIO; + pte = *ptep; + + if (!shift) + shift = PAGE_SHIFT; + + /* Avoid handling anything potentially complicated in realmode */ + if (shift > PAGE_SHIFT) + return -EAGAIN; + + if (!pte_young(pte)) + return -EAGAIN; + + *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | + (ua & ~PAGE_MASK); + + return 0; +} + +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS; + unsigned long tces, entry, ua = 0; + unsigned long *rmap = NULL; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> stt->page_shift; + /* + * The spec says that the maximum size of the list is 512 TCEs + * so the whole table addressed resides in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; - if (ioba >= stt->window_size) - return H_PARAMETER; + rmap = (void *) vmalloc_to_phys(rmap); - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; - vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } + kvmppc_tce_put(stt, entry + i, tce); } - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; +unlock_exit: + unlock_rmap(rmap); + + return ret; +} + +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + /* Check permission bits only to allow userspace poison TCE for debug */ + if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) + return H_PARAMETER; + + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; + + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = (ioba >> stt->page_shift) - stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); + +#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index baeddb06811d..f47fffefadc1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -81,6 +81,17 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +#ifdef CONFIG_KVM_XICS +static struct kernel_param_ops module_param_ops = { + .set = param_set_int, + .get = param_get_int, +}; + +module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); +#endif + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (kvmppc_xics_enabled(vcpu)) { ret = kvmppc_xics_hcall(vcpu, req); break; - } /* fallthrough */ + } + return RESUME_HOST; + case H_PUT_TCE: + ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_PUT_TCE_INDIRECT: + ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_STUFF_TCE: + ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; default: return RESUME_HOST; } @@ -2279,6 +2314,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } /* + * Clear core from the list of active host cores as we are about to + * enter the guest. Only do this if it is the primary thread of the + * core (not if a subcore) that is entering the guest. + */ +static inline void kvmppc_clear_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + /* + * Memory barrier can be omitted here as we will do a smp_wmb() + * later in kvmppc_start_thread and we need ensure that state is + * visible to other CPUs only after we enter guest. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; +} + +/* + * Advertise this core as an active host core since we exited the guest + * Only need to do this if it is the primary thread of the core that is + * exiting. + */ +static inline void kvmppc_set_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + + /* + * Memory barrier can be omitted here because we do a spin_unlock + * immediately after this which provides the memory barrier. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; +} + +/* * Run a set of guest threads on a physical core. * Called with vc->lock held. */ @@ -2390,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } + kvmppc_clear_host_core(pcpu); + /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { @@ -2486,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_ipi_thread(pcpu + i); } + kvmppc_set_host_core(pcpu); + spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ @@ -2984,6 +3063,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +#ifdef CONFIG_KVM_XICS +static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + kvmppc_set_host_core(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kvmppc_clear_host_core(cpu); + break; +#endif + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block kvmppc_cpu_notifier = { + .notifier_call = kvmppc_cpu_notify, +}; + +/* + * Allocate a per-core structure for managing state about which cores are + * running in the host versus the guest and for exchanging data between + * real mode KVM and CPU running in the host. + * This is only done for the first VM. + * The allocated structure stays even if all VMs have stopped. + * It is only freed when the kvm-hv module is unloaded. + * It's OK for this routine to fail, we just don't support host + * core operations like redirecting H_IPI wakeups. + */ +void kvmppc_alloc_host_rm_ops(void) +{ + struct kvmppc_host_rm_ops *ops; + unsigned long l_ops; + int cpu, core; + int size; + + /* Not the first time here ? */ + if (kvmppc_host_rm_ops_hv != NULL) + return; + + ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); + if (!ops) + return; + + size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); + ops->rm_core = kzalloc(size, GFP_KERNEL); + + if (!ops->rm_core) { + kfree(ops); + return; + } + + get_online_cpus(); + + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { + if (!cpu_online(cpu)) + continue; + + core = cpu >> threads_shift; + ops->rm_core[core].rm_state.in_host = 1; + } + + ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; + + /* + * Make the contents of the kvmppc_host_rm_ops structure visible + * to other CPUs before we assign it to the global variable. + * Do an atomic assignment (no locks used here), but if someone + * beats us to it, just free our copy and return. + */ + smp_wmb(); + l_ops = (unsigned long) ops; + + if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + put_online_cpus(); + kfree(ops->rm_core); + kfree(ops); + return; + } + + register_cpu_notifier(&kvmppc_cpu_notifier); + + put_online_cpus(); +} + +void kvmppc_free_host_rm_ops(void) +{ + if (kvmppc_host_rm_ops_hv) { + unregister_cpu_notifier(&kvmppc_cpu_notifier); + kfree(kvmppc_host_rm_ops_hv->rm_core); + kfree(kvmppc_host_rm_ops_hv); + kvmppc_host_rm_ops_hv = NULL; + } +} +#endif + static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -2996,6 +3183,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + kvmppc_alloc_host_rm_ops(); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3229,6 +3418,7 @@ static int kvmppc_book3s_init_hv(void) static void kvmppc_book3s_exit_hv(void) { + kvmppc_free_host_rm_ops(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fd7006bf6b1a..5f0380db3eab 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap) kvmhv_interrupt_vcore(vc, ee); } } + +struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; +EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 24f58076d49e..980d8a6f7284 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -17,12 +17,16 @@ #include <asm/xics.h> #include <asm/debug.h> #include <asm/synch.h> +#include <asm/cputhreads.h> #include <asm/ppc-opcode.h> #include "book3s_xics.h" #define DEBUG_PASSUP +int h_ipi_redirect = 1; +EXPORT_SYMBOL(h_ipi_redirect); + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); @@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, /* -- ICP routines -- */ +#ifdef CONFIG_SMP +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) +{ + int hcpu; + + hcpu = hcore << threads_shift; + kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; + smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); + icp_native_cause_ipi_rm(hcpu); +} +#else +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } +#endif + +/* + * We start the search from our current CPU Id in the core map + * and go in a circle until we get back to our ID looking for a + * core that is running in host context and that hasn't already + * been targeted for another rm_host_ops. + * + * In the future, could consider using a fairer algorithm (one + * that distributes the IPIs better) + * + * Returns -1, if no CPU could be found in the host + * Else, returns a CPU Id which has been reserved for use + */ +static inline int grab_next_hostcore(int start, + struct kvmppc_host_rm_core *rm_core, int max, int action) +{ + bool success; + int core; + union kvmppc_rm_state old, new; + + for (core = start + 1; core < max; core++) { + old = new = READ_ONCE(rm_core[core].rm_state); + + if (!old.in_host || old.rm_action) + continue; + + /* Try to grab this host core if not taken already. */ + new.rm_action = action; + + success = cmpxchg64(&rm_core[core].rm_state.raw, + old.raw, new.raw) == old.raw; + if (success) { + /* + * Make sure that the store to the rm_action is made + * visible before we return to caller (and the + * subsequent store to rm_data) to synchronize with + * the IPI handler. + */ + smp_wmb(); + return core; + } + } + + return -1; +} + +static inline int find_available_hostcore(int action) +{ + int core; + int my_core = smp_processor_id() >> threads_shift; + struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core; + + core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action); + if (core == -1) + core = grab_next_hostcore(core, rm_core, my_core, action); + + return core; +} + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; int cpu; + int hcore; /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; @@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } - /* Check if the core is loaded, if not, too hard */ + /* + * Check if the core is loaded, + * if not, find an available host core to post to wake the VCPU, + * if we can't find one, set up state to eventually return too hard. + */ cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { - this_icp->rm_action |= XICS_RM_KICK_VCPU; - this_icp->rm_kick_target = vcpu; + hcore = -1; + if (kvmppc_host_rm_ops_hv && h_ipi_redirect) + hcore = find_available_hostcore(XICS_RM_KICK_VCPU); + if (hcore != -1) { + icp_send_hcore_msg(hcore, vcpu); + } else { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + } return; } @@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) bail: return check_too_hard(xics, icp); } + +/* --- Non-real mode XICS-related built-in routines --- */ + +/** + * Host Operations poked by RM KVM + */ +static void rm_host_ipi_action(int action, void *data) +{ + switch (action) { + case XICS_RM_KICK_VCPU: + kvmppc_host_rm_ops_hv->vcpu_kick(data); + break; + default: + WARN(1, "Unexpected rm_action=%d data=%p\n", action, data); + break; + } + +} + +void kvmppc_xics_ipi_action(void) +{ + int core; + unsigned int cpu = smp_processor_id(); + struct kvmppc_host_rm_core *rm_corep; + + core = cpu >> threads_shift; + rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core]; + + if (rm_corep->rm_data) { + rm_host_ipi_action(rm_corep->rm_state.rm_action, + rm_corep->rm_data); + /* Order these stores against the real mode KVM */ + rm_corep->rm_data = NULL; + smp_wmb(); + rm_corep->rm_state.rm_action = 0; + } +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6ee26de9a1de..ed16182a008b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2006,8 +2006,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table - .long 0 /* 0x138 */ - .long 0 /* 0x13c */ + .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f2c75a1e0536..02176fd52f84 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba, + tce, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce_value = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return kvmppc_h_pr_bulk_remove(vcpu); case H_PUT_TCE: return kvmppc_h_pr_put_tce(vcpu); + case H_PUT_TCE_INDIRECT: + return kvmppc_h_pr_put_tce_indirect(vcpu); + case H_STUFF_TCE: + return kvmppc_h_pr_stuff_tce(vcpu); case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a3b182dcb823..19aa59b0850c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include <asm/tlbflush.h> #include <asm/cputhreads.h> #include <asm/irqflags.h> +#include <asm/iommu.h> #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; +#ifdef CONFIG_KVM_XICS + /* + * We call kick_all_cpus_sync() to ensure that all + * CPUs have executed any pending IPIs before we + * continue and free VCPUs structures below. + */ + if (is_kvmppc_hv_enabled(kvm)) + kick_all_cpus_sync(); +#endif + kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); @@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_SPAPR_TCE_64: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: @@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; break; + case KVM_CAP_SPAPR_MULTITCE: + r = 1; + break; #endif default: r = 0; @@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } #ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CREATE_SPAPR_TCE_64: { + struct kvm_create_spapr_tce_64 create_tce_64; + + r = -EFAULT; + if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64))) + goto out; + if (create_tce_64.flags) { + r = -EINVAL; + goto out; + } + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); + goto out; + } case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; + struct kvm_create_spapr_tce_64 create_tce_64; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) goto out; - r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + + create_tce_64.liobn = create_tce.liobn; + create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K; + create_tce_64.offset = 0; + create_tce_64.size = create_tce.window_size >> + IOMMU_PAGE_SHIFT_4K; + create_tce_64.flags = 0; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); goto out; } case KVM_PPC_GET_SMMU_INFO: { diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 83dfd7925c72..de37ff445362 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* CONFIG_DEBUG_VM */ +unsigned long vmalloc_to_phys(void *va) +{ + unsigned long pfn = vmalloc_to_pfn(va); + + BUG_ON(!pfn); + return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); +} +EXPORT_SYMBOL_GPL(vmalloc_to_phys); diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9f9dfda9ed2c..3b09ecfd0aee 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event) } } -static unsigned long vmalloc_to_phys(void *v) -{ - struct page *p = vmalloc_to_page(v); - - BUG_ON(!p); - return page_to_phys(p) + offset_in_page(v); -} - /* */ struct event_uniq { struct rb_node node; diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index eae32654bdf2..afdf62f2a695 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) icp_native_set_qirr(cpu, IPI_PRIORITY); } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void icp_native_cause_ipi_rm(int cpu) +{ + /* + * Currently not used to send IPIs to another CPU + * on the same core. Only caller is KVM real mode. + * Need the physical address of the XICS to be + * previously saved in kvm_hstate in the paca. + */ + unsigned long xics_phys; + + /* + * Just like the cause_ipi functions, it is required to + * include a full barrier (out8 includes a sync) before + * causing the IPI. + */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); +} +#endif + /* * Called when an interrupt is received on an off-line CPU to * clear the interrupt, so that the CPU can go back to nap mode. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 82581b6e944d..c251f06f4e11 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -862,6 +862,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122 #define KVM_CAP_HYPERV_SYNIC 123 #define KVM_CAP_S390_RI 124 +#define KVM_CAP_SPAPR_TCE_64 125 #ifdef KVM_CAP_IRQ_ROUTING @@ -1154,6 +1155,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \ + struct kvm_create_spapr_tce_64) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_PPC_HTAB_FD */ |