From 2222ce0fbbcc4ebfa9995c8d23d72c8239ad712c Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 29 Apr 2015 20:44:58 -0500 Subject: powerpc/pseries: Fix possible leaked device node reference Failure return from dlpar_configure_connector when dlpar adding cpus results in leaking references to the cpus parent device node. Move the call to of_node_put() prior to checking the result of dlpar_configure_connector. Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware") Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dlpar.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 019d34aaf054..47d9cebe7159 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -421,11 +421,10 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count) return -ENODEV; dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); + of_node_put(parent); if (!dn) return -EINVAL; - of_node_put(parent); - rc = dlpar_attach_node(dn); if (rc) { dlpar_release_drc(drc_index); -- cgit v1.2.3 From 5af7a6f3e2d015dcaaeffa48c6d47238415cbe66 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 10 Apr 2015 11:52:06 +1000 Subject: powerpc/pasemi: Only the build the pasemi MSI code for PASEMI=y The pasemi MSI code is currently always built when MPIC=y && PCI_MSI=y. It should not have any effect on other platforms, because it immediately checks the MPIC's compatible property for "pasemi,pwrficient-openpic". However it's odd that it's still built even when PASEMI=n. It also needn't be in sysdev, as it's only used by pasemi. So move it into platforms/pasemi, whereby it will only be built for PASEMI=y. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pasemi/Makefile | 1 + arch/powerpc/platforms/pasemi/msi.c | 165 ++++++++++++++++++++++++++++++++ arch/powerpc/sysdev/Makefile | 2 +- arch/powerpc/sysdev/mpic.h | 10 +- arch/powerpc/sysdev/mpic_pasemi_msi.c | 167 --------------------------------- 5 files changed, 172 insertions(+), 173 deletions(-) create mode 100644 arch/powerpc/platforms/pasemi/msi.c delete mode 100644 arch/powerpc/sysdev/mpic_pasemi_msi.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pasemi/Makefile b/arch/powerpc/platforms/pasemi/Makefile index 8e8d4cae5ebe..60b4e0fd9808 100644 --- a/arch/powerpc/platforms/pasemi/Makefile +++ b/arch/powerpc/platforms/pasemi/Makefile @@ -1,2 +1,3 @@ obj-y += setup.o pci.o time.o idle.o powersave.o iommu.o dma_lib.o misc.o obj-$(CONFIG_PPC_PASEMI_MDIO) += gpio_mdio.o +obj-$(CONFIG_PCI_MSI) += msi.o diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c new file mode 100644 index 000000000000..0b3706604543 --- /dev/null +++ b/arch/powerpc/platforms/pasemi/msi.c @@ -0,0 +1,165 @@ +/* + * Copyright 2007, Olof Johansson, PA Semi + * + * Based on arch/powerpc/sysdev/mpic_u3msi.c: + * + * Copyright 2006, Segher Boessenkool, IBM Corporation. + * Copyright 2006-2007, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 of the + * License. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Allocate 16 interrupts per device, to give an alignment of 16, + * since that's the size of the grouping w.r.t. affinity. If someone + * needs more than 32 MSI's down the road we'll have to rethink this, + * but it should be OK for now. + */ +#define ALLOC_CHUNK 16 + +#define PASEMI_MSI_ADDR 0xfc080000 + +/* A bit ugly, can we get this from the pci_dev somehow? */ +static struct mpic *msi_mpic; + + +static void mpic_pasemi_msi_mask_irq(struct irq_data *data) +{ + pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq); + pci_msi_mask_irq(data); + mpic_mask_irq(data); +} + +static void mpic_pasemi_msi_unmask_irq(struct irq_data *data) +{ + pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq); + mpic_unmask_irq(data); + pci_msi_unmask_irq(data); +} + +static struct irq_chip mpic_pasemi_msi_chip = { + .irq_shutdown = mpic_pasemi_msi_mask_irq, + .irq_mask = mpic_pasemi_msi_mask_irq, + .irq_unmask = mpic_pasemi_msi_unmask_irq, + .irq_eoi = mpic_end_irq, + .irq_set_type = mpic_set_irq_type, + .irq_set_affinity = mpic_set_affinity, + .name = "PASEMI-MSI", +}; + +static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct msi_desc *entry; + + pr_debug("pasemi_msi_teardown_msi_irqs, pdev %p\n", pdev); + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (entry->irq == NO_IRQ) + continue; + + irq_set_msi_desc(entry->irq, NULL); + msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, + virq_to_hw(entry->irq), ALLOC_CHUNK); + irq_dispose_mapping(entry->irq); + } + + return; +} + +static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + unsigned int virq; + struct msi_desc *entry; + struct msi_msg msg; + int hwirq; + + if (type == PCI_CAP_ID_MSIX) + pr_debug("pasemi_msi: MSI-X untested, trying anyway\n"); + pr_debug("pasemi_msi_setup_msi_irqs, pdev %p nvec %d type %d\n", + pdev, nvec, type); + + msg.address_hi = 0; + msg.address_lo = PASEMI_MSI_ADDR; + + list_for_each_entry(entry, &pdev->msi_list, list) { + /* Allocate 16 interrupts for now, since that's the grouping for + * affinity. This can be changed later if it turns out 32 is too + * few MSIs for someone, but restrictions will apply to how the + * sources can be changed independently. + */ + hwirq = msi_bitmap_alloc_hwirqs(&msi_mpic->msi_bitmap, + ALLOC_CHUNK); + if (hwirq < 0) { + pr_debug("pasemi_msi: failed allocating hwirq\n"); + return hwirq; + } + + virq = irq_create_mapping(msi_mpic->irqhost, hwirq); + if (virq == NO_IRQ) { + pr_debug("pasemi_msi: failed mapping hwirq 0x%x\n", + hwirq); + msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, + ALLOC_CHUNK); + return -ENOSPC; + } + + /* Vector on MSI is really an offset, the hardware adds + * it to the value written at the magic address. So set + * it to 0 to remain sane. + */ + mpic_set_vector(virq, 0); + + irq_set_msi_desc(virq, entry); + irq_set_chip(virq, &mpic_pasemi_msi_chip); + irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING); + + pr_debug("pasemi_msi: allocated virq 0x%x (hw 0x%x) " \ + "addr 0x%x\n", virq, hwirq, msg.address_lo); + + /* Likewise, the device writes [0...511] into the target + * register to generate MSI [512...1023] + */ + msg.data = hwirq-0x200; + pci_write_msi_msg(virq, &msg); + } + + return 0; +} + +int mpic_pasemi_msi_init(struct mpic *mpic) +{ + int rc; + + if (!mpic->irqhost->of_node || + !of_device_is_compatible(mpic->irqhost->of_node, + "pasemi,pwrficient-openpic")) + return -ENODEV; + + rc = mpic_msi_init_allocator(mpic); + if (rc) { + pr_debug("pasemi_msi: Error allocating bitmap!\n"); + return rc; + } + + pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n"); + + msi_mpic = mpic; + WARN_ON(ppc_md.setup_msi_irqs); + ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs; + ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs; + + return 0; +} diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index f7cb2a1b01fa..5b492a6438ff 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -2,7 +2,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o +mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) obj-$(CONFIG_MPIC_TIMER) += mpic_timer.o obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP) += fsl_mpic_timer_wakeup.o diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index 24bf07a63924..32971a41853b 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -15,7 +15,6 @@ extern void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq); extern int mpic_msi_init_allocator(struct mpic *mpic); extern int mpic_u3msi_init(struct mpic *mpic); -extern int mpic_pasemi_msi_init(struct mpic *mpic); #else static inline void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq) @@ -27,11 +26,12 @@ static inline int mpic_u3msi_init(struct mpic *mpic) { return -1; } +#endif -static inline int mpic_pasemi_msi_init(struct mpic *mpic) -{ - return -1; -} +#if defined(CONFIG_PCI_MSI) && defined(CONFIG_PPC_PASEMI) +int mpic_pasemi_msi_init(struct mpic *mpic); +#else +static inline int mpic_pasemi_msi_init(struct mpic *mpic) { return -1; } #endif extern int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type); diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/sysdev/mpic_pasemi_msi.c deleted file mode 100644 index a3f660eed6de..000000000000 --- a/arch/powerpc/sysdev/mpic_pasemi_msi.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright 2007, Olof Johansson, PA Semi - * - * Based on arch/powerpc/sysdev/mpic_u3msi.c: - * - * Copyright 2006, Segher Boessenkool, IBM Corporation. - * Copyright 2006-2007, Michael Ellerman, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 of the - * License. - * - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include - -#include "mpic.h" - -/* Allocate 16 interrupts per device, to give an alignment of 16, - * since that's the size of the grouping w.r.t. affinity. If someone - * needs more than 32 MSI's down the road we'll have to rethink this, - * but it should be OK for now. - */ -#define ALLOC_CHUNK 16 - -#define PASEMI_MSI_ADDR 0xfc080000 - -/* A bit ugly, can we get this from the pci_dev somehow? */ -static struct mpic *msi_mpic; - - -static void mpic_pasemi_msi_mask_irq(struct irq_data *data) -{ - pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq); - pci_msi_mask_irq(data); - mpic_mask_irq(data); -} - -static void mpic_pasemi_msi_unmask_irq(struct irq_data *data) -{ - pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq); - mpic_unmask_irq(data); - pci_msi_unmask_irq(data); -} - -static struct irq_chip mpic_pasemi_msi_chip = { - .irq_shutdown = mpic_pasemi_msi_mask_irq, - .irq_mask = mpic_pasemi_msi_mask_irq, - .irq_unmask = mpic_pasemi_msi_unmask_irq, - .irq_eoi = mpic_end_irq, - .irq_set_type = mpic_set_irq_type, - .irq_set_affinity = mpic_set_affinity, - .name = "PASEMI-MSI", -}; - -static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct msi_desc *entry; - - pr_debug("pasemi_msi_teardown_msi_irqs, pdev %p\n", pdev); - - list_for_each_entry(entry, &pdev->msi_list, list) { - if (entry->irq == NO_IRQ) - continue; - - irq_set_msi_desc(entry->irq, NULL); - msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, - virq_to_hw(entry->irq), ALLOC_CHUNK); - irq_dispose_mapping(entry->irq); - } - - return; -} - -static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - unsigned int virq; - struct msi_desc *entry; - struct msi_msg msg; - int hwirq; - - if (type == PCI_CAP_ID_MSIX) - pr_debug("pasemi_msi: MSI-X untested, trying anyway\n"); - pr_debug("pasemi_msi_setup_msi_irqs, pdev %p nvec %d type %d\n", - pdev, nvec, type); - - msg.address_hi = 0; - msg.address_lo = PASEMI_MSI_ADDR; - - list_for_each_entry(entry, &pdev->msi_list, list) { - /* Allocate 16 interrupts for now, since that's the grouping for - * affinity. This can be changed later if it turns out 32 is too - * few MSIs for someone, but restrictions will apply to how the - * sources can be changed independently. - */ - hwirq = msi_bitmap_alloc_hwirqs(&msi_mpic->msi_bitmap, - ALLOC_CHUNK); - if (hwirq < 0) { - pr_debug("pasemi_msi: failed allocating hwirq\n"); - return hwirq; - } - - virq = irq_create_mapping(msi_mpic->irqhost, hwirq); - if (virq == NO_IRQ) { - pr_debug("pasemi_msi: failed mapping hwirq 0x%x\n", - hwirq); - msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, - ALLOC_CHUNK); - return -ENOSPC; - } - - /* Vector on MSI is really an offset, the hardware adds - * it to the value written at the magic address. So set - * it to 0 to remain sane. - */ - mpic_set_vector(virq, 0); - - irq_set_msi_desc(virq, entry); - irq_set_chip(virq, &mpic_pasemi_msi_chip); - irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING); - - pr_debug("pasemi_msi: allocated virq 0x%x (hw 0x%x) " \ - "addr 0x%x\n", virq, hwirq, msg.address_lo); - - /* Likewise, the device writes [0...511] into the target - * register to generate MSI [512...1023] - */ - msg.data = hwirq-0x200; - pci_write_msi_msg(virq, &msg); - } - - return 0; -} - -int mpic_pasemi_msi_init(struct mpic *mpic) -{ - int rc; - - if (!mpic->irqhost->of_node || - !of_device_is_compatible(mpic->irqhost->of_node, - "pasemi,pwrficient-openpic")) - return -ENODEV; - - rc = mpic_msi_init_allocator(mpic); - if (rc) { - pr_debug("pasemi_msi: Error allocating bitmap!\n"); - return rc; - } - - pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n"); - - msi_mpic = mpic; - WARN_ON(ppc_md.setup_msi_irqs); - ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs; - ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs; - - return 0; -} -- cgit v1.2.3 From e0d0059169945c8ee16790d2e7244cea397dfd56 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 11 May 2015 20:01:02 +1000 Subject: powerpc/vdso: Disable building the 32-bit VDSO on little endian The only little endian configuration we support is ppc64le. As such if we're building little endian we don't need a 32-bit VDSO, because there is no 32-bit userspace. This patch is a fairly ugly mess of #ifdefs, but is the minimal logic required to disable the 32-bit VDSO. We can hopefully clean up the result in future with some further refactoring. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 3 ++- arch/powerpc/kernel/vdso.c | 36 ++++++++++++++++++++++++++++++++-- arch/powerpc/platforms/Kconfig.cputype | 10 ++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index c1ebbdaac28f..87c7d1473488 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -33,11 +33,12 @@ obj-y := cputable.o ptrace.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o dma.o \ - misc_$(CONFIG_WORD_SIZE).o vdso32/ \ + misc_$(CONFIG_WORD_SIZE).o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o +obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 8331d0bef0fb..b457bfa28436 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -49,13 +49,16 @@ /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) -extern char vdso32_start, vdso32_end; -static void *vdso32_kbase = &vdso32_start; static unsigned int vdso32_pages; +static void *vdso32_kbase; static struct page **vdso32_pagelist; unsigned long vdso32_sigtramp; unsigned long vdso32_rt_sigtramp; +#ifdef CONFIG_VDSO32 +extern char vdso32_start, vdso32_end; +#endif + #ifdef CONFIG_PPC64 extern char vdso64_start, vdso64_end; static void *vdso64_kbase = &vdso64_start; @@ -248,6 +251,7 @@ const char *arch_vma_name(struct vm_area_struct *vma) +#ifdef CONFIG_VDSO32 static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname, unsigned long *size) { @@ -335,6 +339,20 @@ static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32, return 0; } +#else /* !CONFIG_VDSO32 */ +static unsigned long __init find_function32(struct lib32_elfinfo *lib, + const char *symname) +{ + return 0; +} + +static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64, + const char *orig, const char *fix) +{ + return 0; +} +#endif /* CONFIG_VDSO32 */ #ifdef CONFIG_PPC64 @@ -445,6 +463,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32, * Locate symbol tables & text section */ +#ifdef CONFIG_VDSO32 v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize); v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL); if (v32->dynsym == NULL || v32->dynstr == NULL) { @@ -457,6 +476,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32, return -1; } v32->text = sect - vdso32_kbase; +#endif #ifdef CONFIG_PPC64 v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize); @@ -493,7 +513,9 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32, static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, struct lib64_elfinfo *v64) { +#ifdef CONFIG_VDSO32 Elf32_Sym *sym32; +#endif #ifdef CONFIG_PPC64 Elf64_Sym *sym64; @@ -508,6 +530,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, (sym64->st_value - VDSO64_LBASE); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_VDSO32 sym32 = find_symbol32(v32, "__kernel_datapage_offset"); if (sym32 == NULL) { printk(KERN_ERR "vDSO32: Can't find symbol " @@ -517,6 +540,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) = (vdso32_pages << PAGE_SHIFT) - (sym32->st_value - VDSO32_LBASE); +#endif return 0; } @@ -550,6 +574,7 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32, start, start + size); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_VDSO32 start = find_section32(v32->hdr, "__ftr_fixup", &size); if (start) do_feature_fixups(cur_cpu_spec->cpu_features, @@ -571,6 +596,7 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32, if (start) do_lwsync_fixups(cur_cpu_spec->cpu_features, start, start + size); +#endif return 0; } @@ -732,11 +758,15 @@ static int __init vdso_init(void) #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_VDSO32 + vdso32_kbase = &vdso32_start; + /* * Calculate the size of the 32 bits vDSO */ vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); +#endif /* @@ -757,6 +787,7 @@ static int __init vdso_init(void) return 0; } +#ifdef CONFIG_VDSO32 /* Make sure pages are in the correct state */ vdso32_pagelist = kzalloc(sizeof(struct page *) * (vdso32_pages + 2), GFP_KERNEL); @@ -769,6 +800,7 @@ static int __init vdso_init(void) } vdso32_pagelist[i++] = virt_to_page(vdso_data); vdso32_pagelist[i] = NULL; +#endif #ifdef CONFIG_PPC64 vdso64_pagelist = kzalloc(sizeof(struct page *) * (vdso64_pages + 2), diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7264e91190be..724ecc791404 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -405,6 +405,16 @@ config PPC_DOORBELL endmenu +config VDSO32 + def_bool y + depends on PPC32 || CPU_BIG_ENDIAN + help + This symbol controls whether we build the 32-bit VDSO. We obviously + want to do that if we're building a 32-bit kernel. If we're building + a 64-bit kernel then we only want a 32-bit VDSO if we're building for + big endian. That is because the only little endian configuration we + support is ppc64le which is 64-bit only. + choice prompt "Endianness selection" default CPU_BIG_ENDIAN -- cgit v1.2.3 From 38c0488770983b2654f075540cc1fc71f55b6df5 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 30 Apr 2015 13:50:07 +1000 Subject: powerpc/powernv: Silence SYSPARAM warning on boot OpenPower BMC machines do not place any sysparams in the device tree, so at every boot we get a warning: [ 0.437176] SYSPARAM: Opal sysparam node not found Remove the warning, and reorder the init so we don't peform allocations when there is no sysparam node in the device tree. Signed-off-by: Joel Stanley Acked-by: Neelesh Gupta Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-sysparam.c | 31 +++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 9d1acf22a099..2e52b47393e7 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -162,10 +162,20 @@ void __init opal_sys_param_init(void) goto out; } + /* Some systems do not use sysparams; this is not an error */ + sysparam = of_find_node_by_path("/ibm,opal/sysparams"); + if (!sysparam) + goto out; + + if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { + pr_err("SYSPARAM: Opal sysparam node not compatible\n"); + goto out_node_put; + } + sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj); if (!sysparam_kobj) { pr_err("SYSPARAM: Failed to create sysparam kobject\n"); - goto out; + goto out_node_put; } /* Allocate big enough buffer for any get/set transactions */ @@ -176,30 +186,19 @@ void __init opal_sys_param_init(void) goto out_kobj_put; } - sysparam = of_find_node_by_path("/ibm,opal/sysparams"); - if (!sysparam) { - pr_err("SYSPARAM: Opal sysparam node not found\n"); - goto out_param_buf; - } - - if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { - pr_err("SYSPARAM: Opal sysparam node not compatible\n"); - goto out_node_put; - } - /* Number of parameters exposed through DT */ count = of_property_count_strings(sysparam, "param-name"); if (count < 0) { pr_err("SYSPARAM: No string found of property param-name in " "the node %s\n", sysparam->name); - goto out_node_put; + goto out_param_buf; } id = kzalloc(sizeof(*id) * count, GFP_KERNEL); if (!id) { pr_err("SYSPARAM: Failed to allocate memory to read parameter " "id\n"); - goto out_node_put; + goto out_param_buf; } size = kzalloc(sizeof(*size) * count, GFP_KERNEL); @@ -293,12 +292,12 @@ out_free_size: kfree(size); out_free_id: kfree(id); -out_node_put: - of_node_put(sysparam); out_param_buf: kfree(param_data_buf); out_kobj_put: kobject_put(sysparam_kobj); +out_node_put: + of_node_put(sysparam); out: return; } -- cgit v1.2.3 From 2ac3990cc36b1e42feca733b25254fb6dae15431 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 27 Apr 2015 09:25:10 +0800 Subject: powerpc/eeh: fix comment for wait_state() To retrieve the PCI slot state, EEH driver would set a timeout for that. While current comment is not aligned to what the code does. This patch fixes those comments according to the code. Signed-off-by: Wei Yang Acked-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_driver.c | 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- arch/powerpc/platforms/pseries/eeh_pseries.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 24768ff3cb73..89eb4bc34d3a 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -660,7 +660,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_report_error, &result); /* Get the current PCI slot state. This can take a long time, - * sometimes over 3 seconds for certain systems. + * sometimes over 300 seconds for certain systems. */ rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ce738ab3d5a9..d0254710595d 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -979,7 +979,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) /** * pnv_eeh_wait_state - Wait for PE state * @pe: EEH PE - * @max_wait: maximal period in microsecond + * @max_wait: maximal period in millisecond * * Wait for the state of associated PE. It might take some time * to retrieve the PE's state. diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 2039397cc75d..1ba55d0bb449 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -519,7 +519,7 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option) /** * pseries_eeh_wait_state - Wait for PE state * @pe: EEH PE - * @max_wait: maximal period in microsecond + * @max_wait: maximal period in millisecond * * Wait for the state of associated PE. It might take some time * to retrieve the PE's state. -- cgit v1.2.3 From e17866d5593dc6ea7bff9ee56751fae6c3f56223 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 27 Apr 2015 09:25:11 +0800 Subject: powerpc/eeh: fix powernv_eeh_wait_state delay logic As the comment indicates, powernv_eeh_get_state() will inform EEH core to delay 1 second. This means the delay doesn't happen when powernv_eeh_get_state() returns. This patch moves the delay subtraction just before msleep(), which is the same logic in pseries_eeh_wait_state(). Signed-off-by: Wei Yang Acked-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index d0254710595d..622f08cf54b6 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1000,13 +1000,13 @@ static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait) if (ret != EEH_STATE_UNAVAILABLE) return ret; - max_wait -= mwait; if (max_wait <= 0) { pr_warn("%s: Timeout getting PE#%x's state (%d)\n", __func__, pe->addr, max_wait); return EEH_STATE_NOT_SUPPORT; } + max_wait -= mwait; msleep(mwait); } -- cgit v1.2.3 From d4d4add9eacf0a3e642228a22df722f00921a184 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 22 Apr 2015 15:36:50 +1000 Subject: powerpc: Little endian should depend on PPC_BOOK3S_64 The only little endian configuration we support is ppc64le, all other configurations are big endian. So we should only offer a choice of endian if we're building for 64-bit Book3S, ie. PPC_BOOK3S_64. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/Kconfig.cputype | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 724ecc791404..c140e94c7c72 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -431,6 +431,7 @@ config CPU_BIG_ENDIAN config CPU_LITTLE_ENDIAN bool "Build little endian kernel" + depends on PPC_BOOK3S_64 select PPC64_BOOT_WRAPPER help Build a little endian kernel. -- cgit v1.2.3 From d405a98c70ebbaa6ef276d7307d034a682321b95 Mon Sep 17 00:00:00 2001 From: "Shreyas B. Prabhu" Date: Mon, 20 Apr 2015 10:32:57 +0530 Subject: powerpc/powernv: Move cpuidle related code from setup.c to new file This is a cleanup patch; doesn't change any functionality. Moves all cpuidle related code from setup.c to a new file. Signed-off-by: Shreyas B. Prabhu Reviewed-by: Preeti U Murthy [mpe: Fix the SMP=n build by including asm/smp.h in idle.c] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/idle.c | 192 ++++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 171 ---------------------------- 3 files changed, 193 insertions(+), 172 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/idle.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 33e44f37212f..bee923585afc 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,4 +1,4 @@ -obj-y += setup.o opal-wrappers.o opal.o opal-async.o +obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c new file mode 100644 index 000000000000..5c799324b695 --- /dev/null +++ b/arch/powerpc/platforms/powernv/idle.c @@ -0,0 +1,192 @@ +/* + * PowerNV cpuidle code + * + * Copyright 2015 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "powernv.h" +#include "subcore.h" + +static u32 supported_cpuidle_states; + +int pnv_save_sprs_for_winkle(void) +{ + int cpu; + int rc; + + /* + * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross + * all cpus at boot. Get these reg values of current cpu and use the + * same accross all cpus. + */ + uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); + uint64_t hmeer_val = mfspr(SPRN_HMEER); + + for_each_possible_cpu(cpu) { + uint64_t pir = get_hard_smp_processor_id(cpu); + uint64_t hsprg0_val = (uint64_t)&paca[cpu]; + + /* + * HSPRG0 is used to store the cpu's pointer to paca. Hence last + * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0 + * with 63rd bit set, so that when a thread wakes up at 0x100 we + * can use this bit to distinguish between fastsleep and + * deep winkle. + */ + hsprg0_val |= 1; + + rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + if (rc != 0) + return rc; + + /* HIDs are per core registers */ + if (cpu_thread_in_core(cpu) == 0) { + + rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } + } + + return 0; +} + +static void pnv_alloc_idle_core_states(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + u32 *core_idle_state; + + /* + * core_idle_state - First 8 bits track the idle state of each thread + * of the core. The 8th bit is the lock bit. Initially all thread bits + * are set. They are cleared when the thread enters deep idle state + * like sleep and winkle. Initially the lock bit is cleared. + * The lock bit has 2 purposes + * a. While the first thread is restoring core state, it prevents + * other threads in the core from switching to process context. + * b. While the last thread in the core is saving the core state, it + * prevents a different thread from waking up. + */ + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); + *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].core_idle_state_ptr = core_idle_state; + paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; + paca[cpu].thread_mask = 1 << j; + } + } + + update_subcore_sibling_mask(); + + if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) + pnv_save_sprs_for_winkle(); +} + +u32 pnv_get_supported_cpuidle_states(void) +{ + return supported_cpuidle_states; +} +EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); + +static int __init pnv_init_idle_states(void) +{ + struct device_node *power_mgt; + int dt_idle_states; + u32 *flags; + int i; + + supported_cpuidle_states = 0; + + if (cpuidle_disable != IDLE_NO_OVERRIDE) + goto out; + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + goto out; + + power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); + if (!power_mgt) { + pr_warn("opal: PowerMgmt Node not found\n"); + goto out; + } + dt_idle_states = of_property_count_u32_elems(power_mgt, + "ibm,cpu-idle-state-flags"); + if (dt_idle_states < 0) { + pr_warn("cpuidle-powernv: no idle states found in the DT\n"); + goto out; + } + + flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL); + if (of_property_read_u32_array(power_mgt, + "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n"); + goto out_free; + } + + for (i = 0; i < dt_idle_states; i++) + supported_cpuidle_states |= flags[i]; + + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_entry, + PPC_INST_NOP); + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_exit, + PPC_INST_NOP); + } + pnv_alloc_idle_core_states(); +out_free: + kfree(flags); +out: + return 0; +} + +subsys_initcall(pnv_init_idle_states); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 16fdcb23f4c3..509cdd2f7ad8 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -35,12 +35,8 @@ #include #include #include -#include -#include -#include #include "powernv.h" -#include "subcore.h" static void __init pnv_setup_arch(void) { @@ -277,173 +273,6 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.handle_hmi_exception = opal_handle_hmi_exception; } -static u32 supported_cpuidle_states; - -int pnv_save_sprs_for_winkle(void) -{ - int cpu; - int rc; - - /* - * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross - * all cpus at boot. Get these reg values of current cpu and use the - * same accross all cpus. - */ - uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; - uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); - uint64_t hmeer_val = mfspr(SPRN_HMEER); - - for_each_possible_cpu(cpu) { - uint64_t pir = get_hard_smp_processor_id(cpu); - uint64_t hsprg0_val = (uint64_t)&paca[cpu]; - - /* - * HSPRG0 is used to store the cpu's pointer to paca. Hence last - * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0 - * with 63rd bit set, so that when a thread wakes up at 0x100 we - * can use this bit to distinguish between fastsleep and - * deep winkle. - */ - hsprg0_val |= 1; - - rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); - if (rc != 0) - return rc; - - rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); - if (rc != 0) - return rc; - - /* HIDs are per core registers */ - if (cpu_thread_in_core(cpu) == 0) { - - rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val); - if (rc != 0) - return rc; - - rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val); - if (rc != 0) - return rc; - - rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); - if (rc != 0) - return rc; - - rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); - if (rc != 0) - return rc; - - rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); - if (rc != 0) - return rc; - } - } - - return 0; -} - -static void pnv_alloc_idle_core_states(void) -{ - int i, j; - int nr_cores = cpu_nr_cores(); - u32 *core_idle_state; - - /* - * core_idle_state - First 8 bits track the idle state of each thread - * of the core. The 8th bit is the lock bit. Initially all thread bits - * are set. They are cleared when the thread enters deep idle state - * like sleep and winkle. Initially the lock bit is cleared. - * The lock bit has 2 purposes - * a. While the first thread is restoring core state, it prevents - * other threads in the core from switching to process context. - * b. While the last thread in the core is saving the core state, it - * prevents a different thread from waking up. - */ - for (i = 0; i < nr_cores; i++) { - int first_cpu = i * threads_per_core; - int node = cpu_to_node(first_cpu); - - core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; - - for (j = 0; j < threads_per_core; j++) { - int cpu = first_cpu + j; - - paca[cpu].core_idle_state_ptr = core_idle_state; - paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; - paca[cpu].thread_mask = 1 << j; - } - } - - update_subcore_sibling_mask(); - - if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) - pnv_save_sprs_for_winkle(); -} - -u32 pnv_get_supported_cpuidle_states(void) -{ - return supported_cpuidle_states; -} -EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); - -static int __init pnv_init_idle_states(void) -{ - struct device_node *power_mgt; - int dt_idle_states; - u32 *flags; - int i; - - supported_cpuidle_states = 0; - - if (cpuidle_disable != IDLE_NO_OVERRIDE) - goto out; - - if (!firmware_has_feature(FW_FEATURE_OPALv3)) - goto out; - - power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); - if (!power_mgt) { - pr_warn("opal: PowerMgmt Node not found\n"); - goto out; - } - dt_idle_states = of_property_count_u32_elems(power_mgt, - "ibm,cpu-idle-state-flags"); - if (dt_idle_states < 0) { - pr_warn("cpuidle-powernv: no idle states found in the DT\n"); - goto out; - } - - flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL); - if (of_property_read_u32_array(power_mgt, - "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n"); - goto out_free; - } - - for (i = 0; i < dt_idle_states; i++) - supported_cpuidle_states |= flags[i]; - - if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - } - pnv_alloc_idle_core_states(); -out_free: - kfree(flags); -out: - return 0; -} - -subsys_initcall(pnv_init_idle_states); - static int __init pnv_probe(void) { unsigned long root = of_get_flat_dt_root(); -- cgit v1.2.3 From 5703d2f4a1da6d23b3be896947ce255226fc4295 Mon Sep 17 00:00:00 2001 From: "Shreyas B. Prabhu" Date: Mon, 20 Apr 2015 10:32:58 +0530 Subject: powerpc/powernv: Introduce sysfs control for fastsleep workaround behavior Fastsleep is one of the idle state which cpuidle subsystem currently uses on power8 machines. In this state L2 cache is brought down to a threshold voltage. Therefore when the core is in fastsleep, the communication between L2 and L3 needs to be fenced. But there is a bug in the current power8 chips surrounding this fencing. OPAL provides a workaround which precludes the possibility of hitting this bug. But running with this workaround applied causes checkstop if any correctable error in L2 cache directory is detected. Hence OPAL also provides a way to undo the workaround. In the existing implementation, workaround is applied by the last thread of the core entering fastsleep and undone by the first thread waking up. But this has a performance cost. These OPAL calls account for roughly 4000 cycles everytime the core has to enter or wakeup from fastsleep. This patch introduces a sysfs attribute (fastsleep_workaround_applyonce) to choose the behavior of this workaround. By default, fastsleep_workaround_applyonce = 0. In this case, workaround is applied/undone everytime the core enters/exits fastsleep. fastsleep_workaround_applyonce = 1. In this case the workaround is applied once on all the cores and never undone. This can be triggered by echo 1 > /sys/devices/system/cpu/fastsleep_workaround_applyonce For simplicity this attribute can be modified only once. Implying, once fastsleep_workaround_applyonce is changed to 1, it cannot be reverted to the default state. Signed-off-by: Shreyas B. Prabhu Reviewed-by: Preeti U Murthy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 7 ++ arch/powerpc/include/asm/opal.h | 1 + arch/powerpc/platforms/powernv/idle.c | 101 +++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + 4 files changed, 110 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0321a909e663..a49e5fa6f939 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -165,6 +165,13 @@ #define OPAL_PM_WINKLE_ENABLED 0x00040000 #define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000 /* with workaround */ +/* + * OPAL_CONFIG_CPU_IDLE_STATE parameters + */ +#define OPAL_CONFIG_IDLE_FASTSLEEP 1 +#define OPAL_CONFIG_IDLE_UNDO 0 +#define OPAL_CONFIG_IDLE_APPLY 1 + #ifndef __ASSEMBLY__ /* Other enums */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 042af1abfc4d..9a4781357fa6 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -186,6 +186,7 @@ int64_t opal_handle_hmi(void); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); +int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag); int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number); int64_t opal_ipmi_send(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t msg_len); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 5c799324b695..bd39a120bd60 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -137,6 +139,96 @@ u32 pnv_get_supported_cpuidle_states(void) } EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); + +static void pnv_fastsleep_workaround_apply(void *info) + +{ + int rc; + int *err = info; + + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_APPLY); + if (rc) + *err = 1; +} + +/* + * Used to store fastsleep workaround state + * 0 - Workaround applied/undone at fastsleep entry/exit path (Default) + * 1 - Workaround applied once, never undone. + */ +static u8 fastsleep_workaround_applyonce; + +static ssize_t show_fastsleep_workaround_applyonce(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", fastsleep_workaround_applyonce); +} + +static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + cpumask_t primary_thread_mask; + int err; + u8 val; + + if (kstrtou8(buf, 0, &val) || val != 1) + return -EINVAL; + + if (fastsleep_workaround_applyonce == 1) + return count; + + /* + * fastsleep_workaround_applyonce = 1 implies + * fastsleep workaround needs to be left in 'applied' state on all + * the cores. Do this by- + * 1. Patching out the call to 'undo' workaround in fastsleep exit path + * 2. Sending ipi to all the cores which have atleast one online thread + * 3. Patching out the call to 'apply' workaround in fastsleep entry + * path + * There is no need to send ipi to cores which have all threads + * offlined, as last thread of the core entering fastsleep or deeper + * state would have applied workaround. + */ + err = patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_exit, + PPC_INST_NOP); + if (err) { + pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit"); + goto fail; + } + + get_online_cpus(); + primary_thread_mask = cpu_online_cores_map(); + on_each_cpu_mask(&primary_thread_mask, + pnv_fastsleep_workaround_apply, + &err, 1); + put_online_cpus(); + if (err) { + pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); + goto fail; + } + + err = patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_entry, + PPC_INST_NOP); + if (err) { + pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry"); + goto fail; + } + + fastsleep_workaround_applyonce = 1; + + return count; +fail: + return -EIO; +} + +static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, + show_fastsleep_workaround_applyonce, + store_fastsleep_workaround_applyonce); + static int __init pnv_init_idle_states(void) { struct device_node *power_mgt; @@ -181,7 +273,16 @@ static int __init pnv_init_idle_states(void) patch_instruction( (unsigned int *)pnv_fastsleep_workaround_at_exit, PPC_INST_NOP); + } else { + /* + * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that + * workaround is needed to use fastsleep. Provide sysfs + * control to choose how this workaround has to be applied. + */ + device_create_file(cpu_subsys.dev_root, + &dev_attr_fastsleep_workaround_applyonce); } + pnv_alloc_idle_core_states(); out_free: kfree(flags); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index a7ade94cdf87..bf15ead00e12 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -283,6 +283,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); -- cgit v1.2.3 From 96e023e7534c16ab54e236c114340e2447c36d2f Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:36 +1000 Subject: powerpc/powernv: Reorder OPAL subsystem initialisation Most of the OPAL subsystems are always compiled in for PowerNV and many of them need to be initialised before or after other OPAL subsystems. Rather than trying to control this ordering through machine initcalls it is clearer and easier to control initialisation order with explicit calls in opal_init. Signed-off-by: Alistair Popple Cc: Mahesh Jagannath Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 3 +++ arch/powerpc/platforms/powernv/opal-async.c | 3 +-- arch/powerpc/platforms/powernv/opal-hmi.c | 3 +-- arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +- arch/powerpc/platforms/powernv/opal-sensor.c | 3 +-- arch/powerpc/platforms/powernv/opal.c | 13 ++++++++++++- 6 files changed, 19 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9a4781357fa6..dcdf83c86256 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -240,6 +240,9 @@ extern int opal_elog_init(void); extern void opal_platform_dump_init(void); extern void opal_sys_param_init(void); extern void opal_msglog_init(void); +extern int opal_async_comp_init(void); +extern int opal_sensor_init(void); +extern int opal_hmi_handler_init(void); extern int opal_machine_check(struct pt_regs *regs); extern bool opal_mce_check_early_recovery(struct pt_regs *regs); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 693b6cdac691..bdc8c0c71d15 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -151,7 +151,7 @@ static struct notifier_block opal_async_comp_nb = { .priority = 0, }; -static int __init opal_async_comp_init(void) +int __init opal_async_comp_init(void) { struct device_node *opal_node; const __be32 *async; @@ -205,4 +205,3 @@ out_opal_node: out: return err; } -machine_subsys_initcall(powernv, opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index b322bfb51343..a8f49d380449 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -170,7 +170,7 @@ static struct notifier_block opal_hmi_handler_nb = { .priority = 0, }; -static int __init opal_hmi_handler_init(void) +int __init opal_hmi_handler_init(void) { int ret; @@ -186,4 +186,3 @@ static int __init opal_hmi_handler_init(void) } return 0; } -machine_subsys_initcall(powernv, opal_hmi_handler_init); diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 43db2136dbff..00a29432be39 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -144,4 +144,4 @@ static int __init opal_mem_err_init(void) } return 0; } -machine_subsys_initcall(powernv, opal_mem_err_init); +machine_device_initcall(powernv, opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index 655250499d18..a06059df9239 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -77,7 +77,7 @@ out: } EXPORT_SYMBOL_GPL(opal_get_sensor_data); -static __init int opal_sensor_init(void) +int __init opal_sensor_init(void) { struct platform_device *pdev; struct device_node *sensor; @@ -93,4 +93,3 @@ static __init int opal_sensor_init(void) return PTR_ERR_OR_ZERO(pdev); } -machine_subsys_initcall(powernv, opal_sensor_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2241565b0739..eb3decc67c43 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -393,7 +393,6 @@ static int __init opal_message_init(void) } return 0; } -machine_early_initcall(powernv, opal_message_init); int opal_get_chars(uint32_t vtermno, char *buf, int count) { @@ -807,6 +806,18 @@ static int __init opal_init(void) of_node_put(consoles); } + /* Initialise OPAL messaging system */ + opal_message_init(); + + /* Initialise OPAL asynchronous completion interface */ + opal_async_comp_init(); + + /* Initialise OPAL sensor interface */ + opal_sensor_init(); + + /* Initialise OPAL hypervisor maintainence interrupt handling */ + opal_hmi_handler_init(); + /* Create i2c platform devices */ opal_i2c_create_devs(); -- cgit v1.2.3 From 9f0fd0499d30dbd61632463f293e2e826fa363b1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:37 +1000 Subject: powerpc/powernv: Add a virtual irqchip for opal events Whenever an interrupt is received for opal the linux kernel gets a bitfield indicating certain events that have occurred and need handling by the various device drivers. Currently this is handled using a notifier interface where we call every device driver that has registered to receive opal events. This approach has several drawbacks. For example each driver has to do its own checking to see if the event is relevant as well as event masking. There is also no easy method of recording the number of times we receive particular events. This patch solves these issues by exposing opal events via the standard interrupt APIs by adding a new interrupt chip and domain. Drivers can then register for the appropriate events using standard kernel calls such as irq_of_parse_and_map(). Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-irqchip.c | 253 ++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal.c | 78 ++------ arch/powerpc/platforms/powernv/powernv.h | 4 + 5 files changed, 273 insertions(+), 67 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-irqchip.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index dcdf83c86256..1412814347ba 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -243,6 +243,7 @@ extern void opal_msglog_init(void); extern int opal_async_comp_init(void); extern int opal_sensor_init(void); extern int opal_hmi_handler_init(void); +extern int opal_event_init(void); extern int opal_machine_check(struct pt_regs *regs); extern bool opal_mce_check_early_recovery(struct pt_regs *regs); @@ -254,6 +255,8 @@ extern int opal_resync_timebase(void); extern void opal_lpc_init(void); +extern int opal_event_request(unsigned int opal_event_nr); + struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, unsigned long vmalloc_size); void opal_free_sg_list(struct opal_sg_list *sg); diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index bee923585afc..26bd7e417b7c 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,7 +1,7 @@ obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o -obj-y += opal-msglog.o opal-hmi.o opal-power.o +obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c new file mode 100644 index 000000000000..bd5125dcf0d7 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -0,0 +1,253 @@ +/* + * This file implements an irqchip for OPAL events. Whenever there is + * an interrupt that is handled by OPAL we get passed a list of events + * that Linux needs to do something about. These basically look like + * interrupts to Linux so we implement an irqchip to handle them. + * + * Copyright Alistair Popple, IBM Corporation 2014. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "powernv.h" + +/* Maximum number of events supported by OPAL firmware */ +#define MAX_NUM_EVENTS 64 + +struct opal_event_irqchip { + struct irq_chip irqchip; + struct irq_domain *domain; + unsigned long mask; +}; +static struct opal_event_irqchip opal_event_irqchip; + +static unsigned int opal_irq_count; +static unsigned int *opal_irqs; + +static void opal_handle_irq_work(struct irq_work *work); +static __be64 last_outstanding_events; +static struct irq_work opal_event_irq_work = { + .func = opal_handle_irq_work, +}; + +static void opal_event_mask(struct irq_data *d) +{ + clear_bit(d->hwirq, &opal_event_irqchip.mask); +} + +static void opal_event_unmask(struct irq_data *d) +{ + set_bit(d->hwirq, &opal_event_irqchip.mask); + + opal_poll_events(&last_outstanding_events); + if (last_outstanding_events & opal_event_irqchip.mask) + /* Need to retrigger the interrupt */ + irq_work_queue(&opal_event_irq_work); +} + +static int opal_event_set_type(struct irq_data *d, unsigned int flow_type) +{ + /* + * For now we only support level triggered events. The irq + * handler will be called continuously until the event has + * been cleared in OPAL. + */ + if (flow_type != IRQ_TYPE_LEVEL_HIGH) + return -EINVAL; + + return 0; +} + +static struct opal_event_irqchip opal_event_irqchip = { + .irqchip = { + .name = "OPAL EVT", + .irq_mask = opal_event_mask, + .irq_unmask = opal_event_unmask, + .irq_set_type = opal_event_set_type, + }, + .mask = 0, +}; + +static int opal_event_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) +{ + irq_set_chip_data(irq, &opal_event_irqchip); + irq_set_chip_and_handler(irq, &opal_event_irqchip.irqchip, + handle_level_irq); + + return 0; +} + +void opal_handle_events(uint64_t events) +{ + int virq, hwirq = 0; + u64 mask = opal_event_irqchip.mask; + u64 notifier_mask = 0; + + if (!in_irq() && (events & mask)) { + last_outstanding_events = events; + irq_work_queue(&opal_event_irq_work); + return; + } + + while (events) { + hwirq = fls64(events) - 1; + virq = irq_find_mapping(opal_event_irqchip.domain, + hwirq); + if (virq) { + if (BIT_ULL(hwirq) & mask) + generic_handle_irq(virq); + } else + notifier_mask |= BIT_ULL(hwirq); + events &= ~BIT_ULL(hwirq); + } + + opal_do_notifier(notifier_mask); +} + +static irqreturn_t opal_interrupt(int irq, void *data) +{ + __be64 events; + + opal_handle_interrupt(virq_to_hw(irq), &events); + opal_handle_events(be64_to_cpu(events)); + + return IRQ_HANDLED; +} + +static void opal_handle_irq_work(struct irq_work *work) +{ + opal_handle_events(be64_to_cpu(last_outstanding_events)); +} + +static int opal_event_match(struct irq_domain *h, struct device_node *node) +{ + return h->of_node == node; +} + +static int opal_event_xlate(struct irq_domain *h, struct device_node *np, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) +{ + *out_hwirq = intspec[0]; + *out_flags = IRQ_TYPE_LEVEL_HIGH; + + return 0; +} + +static const struct irq_domain_ops opal_event_domain_ops = { + .match = opal_event_match, + .map = opal_event_map, + .xlate = opal_event_xlate, +}; + +void opal_event_shutdown(void) +{ + unsigned int i; + + /* First free interrupts, which will also mask them */ + for (i = 0; i < opal_irq_count; i++) { + if (opal_irqs[i]) + free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; + } +} + +int __init opal_event_init(void) +{ + struct device_node *dn, *opal_node; + const __be32 *irqs; + int i, irqlen, rc = 0; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_warn("opal: Node not found\n"); + return -ENODEV; + } + + /* If dn is NULL it means the domain won't be linked to a DT + * node so therefore irq_of_parse_and_map(...) wont work. But + * that shouldn't be problem because if we're running a + * version of skiboot that doesn't have the dn then the + * devices won't have the correct properties and will have to + * fall back to the legacy method (opal_event_request(...)) + * anyway. */ + dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event"); + opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS, + &opal_event_domain_ops, &opal_event_irqchip); + of_node_put(dn); + if (!opal_event_irqchip.domain) { + pr_warn("opal: Unable to create irq domain\n"); + rc = -ENOMEM; + goto out; + } + + /* Get interrupt property */ + irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); + opal_irq_count = irqs ? (irqlen / 4) : 0; + pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count); + + /* Install interrupt handlers */ + opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL); + for (i = 0; irqs && i < opal_irq_count; i++, irqs++) { + unsigned int irq, virq; + + /* Get hardware and virtual IRQ */ + irq = be32_to_cpup(irqs); + virq = irq_create_mapping(NULL, irq); + if (virq == NO_IRQ) { + pr_warn("Failed to map irq 0x%x\n", irq); + continue; + } + + /* Install interrupt handler */ + rc = request_irq(virq, opal_interrupt, 0, "opal", NULL); + if (rc) { + irq_dispose_mapping(virq); + pr_warn("Error %d requesting irq %d (0x%x)\n", + rc, virq, irq); + continue; + } + + /* Cache IRQ */ + opal_irqs[i] = virq; + } + +out: + of_node_put(opal_node); + return rc; +} + +/** + * opal_event_request(unsigned int opal_event_nr) - Request an event + * @opal_event_nr: the opal event number to request + * + * This routine can be used to find the linux virq number which can + * then be passed to request_irq to assign a handler for a particular + * opal event. This should only be used by legacy devices which don't + * have proper device tree bindings. Most devices should use + * irq_of_parse_and_map() instead. + */ +int opal_event_request(unsigned int opal_event_nr) +{ + return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr); +} +EXPORT_SYMBOL(opal_event_request); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index eb3decc67c43..3baca718261a 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -53,8 +53,6 @@ static int mc_recoverable_range_len; struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); -static unsigned int *opal_irqs; -static unsigned int opal_irq_count; static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static DEFINE_SPINLOCK(opal_notifier_lock); @@ -251,7 +249,7 @@ int opal_notifier_unregister(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(opal_notifier_unregister); -static void opal_do_notifier(uint64_t events) +void opal_do_notifier(uint64_t events) { unsigned long flags; uint64_t changed_mask; @@ -571,8 +569,10 @@ int opal_handle_hmi_exception(struct pt_regs *regs) local_paca->hmi_event_available = 0; rc = opal_poll_events(&evt); - if (rc == OPAL_SUCCESS && evt) + if (rc == OPAL_SUCCESS && evt) { opal_do_notifier(be64_to_cpu(evt)); + opal_handle_events(be64_to_cpu(evt)); + } return 1; } @@ -609,17 +609,6 @@ out: return !!recover_addr; } -static irqreturn_t opal_interrupt(int irq, void *data) -{ - __be64 events; - - opal_handle_interrupt(virq_to_hw(irq), &events); - - opal_do_notifier(be64_to_cpu(events)); - - return IRQ_HANDLED; -} - static int opal_sysfs_init(void) { opal_kobj = kobject_create_and_add("opal", firmware_kobj); @@ -718,52 +707,15 @@ static void opal_i2c_create_devs(void) of_platform_device_create(np, NULL, NULL); } -static void __init opal_irq_init(struct device_node *dn) -{ - const __be32 *irqs; - int i, irqlen; - - /* Get interrupt property */ - irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); - opal_irq_count = irqs ? (irqlen / 4) : 0; - pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count); - if (!opal_irq_count) - return; - - /* Install interrupt handlers */ - opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL); - for (i = 0; irqs && i < opal_irq_count; i++, irqs++) { - unsigned int irq, virq; - int rc; - - /* Get hardware and virtual IRQ */ - irq = be32_to_cpup(irqs); - virq = irq_create_mapping(NULL, irq); - if (virq == NO_IRQ) { - pr_warn("Failed to map irq 0x%x\n", irq); - continue; - } - - /* Install interrupt handler */ - rc = request_irq(virq, opal_interrupt, 0, "opal", NULL); - if (rc) { - irq_dispose_mapping(virq); - pr_warn("Error %d requesting irq %d (0x%x)\n", - rc, virq, irq); - continue; - } - - /* Cache IRQ */ - opal_irqs[i] = virq; - } -} - static int kopald(void *unused) { + __be64 events; + set_freezable(); do { try_to_freeze(); - opal_poll_events(NULL); + opal_poll_events(&events); + opal_handle_events(be64_to_cpu(events)); msleep_interruptible(opal_heartbeat); } while (!kthread_should_stop()); @@ -792,6 +744,9 @@ static int __init opal_init(void) return -ENODEV; } + /* Initialise OPAL events */ + opal_event_init(); + /* Register OPAL consoles if any ports */ if (firmware_has_feature(FW_FEATURE_OPALv2)) consoles = of_find_node_by_path("/ibm,opal/consoles"); @@ -824,9 +779,6 @@ static int __init opal_init(void) /* Setup a heatbeat thread if requested by OPAL */ opal_init_heartbeat(); - /* Find all OPAL interrupts and request them */ - opal_irq_init(opal_node); - /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { @@ -857,15 +809,9 @@ machine_subsys_initcall(powernv, opal_init); void opal_shutdown(void) { - unsigned int i; long rc = OPAL_BUSY; - /* First free interrupts, which will also mask them */ - for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) - free_irq(opal_irqs[i], NULL); - opal_irqs[i] = 0; - } + opal_event_shutdown(); /* * Then sync with OPAL which ensure anything that can diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 826d2c9bea56..221d4c827fc5 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -35,6 +35,10 @@ extern u32 pnv_get_supported_cpuidle_states(void); extern void pnv_lpc_init(void); +extern void opal_do_notifier(uint64_t events); +extern void opal_handle_events(uint64_t events); +extern void opal_event_shutdown(void); + bool cpu_core_split_required(void); #endif /* _POWERNV_H */ -- cgit v1.2.3 From 79231448c929cc0870f6e5cc10c485661dfb4a9f Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:40 +1000 Subject: powernv/eeh: Update the EEH code to use the opal irq domain The eeh code currently uses the old notifier method to get eeh events from OPAL. It also contains some logic to filter opal events which has been moved into the virtual irqchip. This patch converts the eeh code to the new event interface which simplifies event handling. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 58 +++++++++++++++------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 622f08cf54b6..5cf5e6ea213b 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include "pci.h" static bool pnv_eeh_nb_init = false; +static int eeh_event_irq = -EINVAL; /** * pnv_eeh_init - EEH platform dependent initialization @@ -88,34 +90,22 @@ static int pnv_eeh_init(void) return 0; } -static int pnv_eeh_event(struct notifier_block *nb, - unsigned long events, void *change) +static irqreturn_t pnv_eeh_event(int irq, void *data) { - uint64_t changed_evts = (uint64_t)change; - /* - * We simply send special EEH event if EEH has - * been enabled, or clear pending events in - * case that we enable EEH soon + * We simply send a special EEH event if EEH has been + * enabled. We don't care about EEH events until we've + * finished processing the outstanding ones. Event processing + * gets unmasked in next_error() if EEH is enabled. */ - if (!(changed_evts & OPAL_EVENT_PCI_ERROR) || - !(events & OPAL_EVENT_PCI_ERROR)) - return 0; + disable_irq_nosync(irq); if (eeh_enabled()) eeh_send_failure_event(NULL); - else - opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); - return 0; + return IRQ_HANDLED; } -static struct notifier_block pnv_eeh_nb = { - .notifier_call = pnv_eeh_event, - .next = NULL, - .priority = 0 -}; - #ifdef CONFIG_DEBUG_FS static ssize_t pnv_eeh_ei_write(struct file *filp, const char __user *user_buf, @@ -237,16 +227,28 @@ static int pnv_eeh_post_init(void) /* Register OPAL event notifier */ if (!pnv_eeh_nb_init) { - ret = opal_notifier_register(&pnv_eeh_nb); - if (ret) { - pr_warn("%s: Can't register OPAL event notifier (%d)\n", - __func__, ret); + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } + + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); return ret; } pnv_eeh_nb_init = true; } + if (!eeh_enabled()) + disable_irq(eeh_event_irq); + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; @@ -1303,12 +1305,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) int state, ret = EEH_NEXT_ERR_NONE; /* - * While running here, it's safe to purge the event queue. - * And we should keep the cached OPAL notifier event sychronized - * between the kernel and firmware. + * While running here, it's safe to purge the event queue. The + * event should still be masked. */ eeh_remove_event(NULL, false); - opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); list_for_each_entry(hose, &hose_list, list_node) { /* @@ -1477,6 +1477,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) break; } + /* Unmask the event */ + if (eeh_enabled()) + enable_irq(eeh_event_irq); + return ret; } -- cgit v1.2.3 From a295af24d0d2af238d74a994603407b72de157b3 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:41 +1000 Subject: powernv/opal: Convert opal message events to opal irq domain This patch converts the opal message event to use the new opal irq domain. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 3baca718261a..cd5718b74d7c 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -362,33 +362,34 @@ static void opal_handle_message(void) opal_message_do_notify(type, (void *)&msg); } -static int opal_message_notify(struct notifier_block *nb, - unsigned long events, void *change) +static irqreturn_t opal_message_notify(int irq, void *data) { - if (events & OPAL_EVENT_MSG_PENDING) - opal_handle_message(); - return 0; + opal_handle_message(); + return IRQ_HANDLED; } -static struct notifier_block opal_message_nb = { - .notifier_call = opal_message_notify, - .next = NULL, - .priority = 0, -}; - static int __init opal_message_init(void) { - int ret, i; + int ret, i, irq; for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); - ret = opal_notifier_register(&opal_message_nb); + irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING)); + if (!irq) { + pr_err("%s: Can't register OPAL event irq (%d)\n", + __func__, irq); + return irq; + } + + ret = request_irq(irq, opal_message_notify, + IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL); if (ret) { - pr_err("%s: Can't register OPAL event notifier (%d)\n", + pr_err("%s: Can't request OPAL event irq (%d)\n", __func__, ret); return ret; } + return 0; } -- cgit v1.2.3 From 74159a70283ba685cd4d856cfc4438596524b1f2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:42 +1000 Subject: powernv/elog: Convert elog to opal irq domain This patch converts the elog code to use the opal irq domain instead of notifier events. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-elog.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index 38ce757e5e2a..4949ef0d9400 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -276,24 +277,15 @@ static void elog_work_fn(struct work_struct *work) static DECLARE_WORK(elog_work, elog_work_fn); -static int elog_event(struct notifier_block *nb, - unsigned long events, void *change) +static irqreturn_t elog_event(int irq, void *data) { - /* check for error log event */ - if (events & OPAL_EVENT_ERROR_LOG_AVAIL) - schedule_work(&elog_work); - return 0; + schedule_work(&elog_work); + return IRQ_HANDLED; } -static struct notifier_block elog_nb = { - .notifier_call = elog_event, - .next = NULL, - .priority = 0 -}; - int __init opal_elog_init(void) { - int rc = 0; + int rc = 0, irq; /* ELOG not supported by firmware */ if (!opal_check_token(OPAL_ELOG_READ)) @@ -305,10 +297,18 @@ int __init opal_elog_init(void) return -1; } - rc = opal_notifier_register(&elog_nb); + irq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL)); + if (!irq) { + pr_err("%s: Can't register OPAL event irq (%d)\n", + __func__, irq); + return irq; + } + + rc = request_irq(irq, elog_event, + IRQ_TYPE_LEVEL_HIGH, "opal-elog", NULL); if (rc) { - pr_err("%s: Can't register OPAL event notifier (%d)\n", - __func__, rc); + pr_err("%s: Can't request OPAL event irq (%d)\n", + __func__, rc); return rc; } -- cgit v1.2.3 From 8034f715f0a5fce97b14bacd47eefb1513316b17 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:43 +1000 Subject: powernv/opal-dump: Convert to irq domain Convert the opal dump driver to the new opal irq domain. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-dump.c | 56 +++++++++--------------------- 1 file changed, 17 insertions(+), 39 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c index 5aa9c1ce4de3..2ee96431f736 100644 --- a/arch/powerpc/platforms/powernv/opal-dump.c +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -60,7 +61,7 @@ static ssize_t dump_type_show(struct dump_obj *dump_obj, struct dump_attribute *attr, char *buf) { - + return sprintf(buf, "0x%x %s\n", dump_obj->type, dump_type_to_string(dump_obj->type)); } @@ -363,7 +364,7 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size, return dump; } -static int process_dump(void) +static irqreturn_t process_dump(int irq, void *data) { int rc; uint32_t dump_id, dump_size, dump_type; @@ -387,45 +388,13 @@ static int process_dump(void) if (!dump) return -1; - return 0; -} - -static void dump_work_fn(struct work_struct *work) -{ - process_dump(); + return IRQ_HANDLED; } -static DECLARE_WORK(dump_work, dump_work_fn); - -static void schedule_process_dump(void) -{ - schedule_work(&dump_work); -} - -/* - * New dump available notification - * - * Once we get notification, we add sysfs entries for it. - * We only fetch the dump on demand, and create sysfs asynchronously. - */ -static int dump_event(struct notifier_block *nb, - unsigned long events, void *change) -{ - if (events & OPAL_EVENT_DUMP_AVAIL) - schedule_process_dump(); - - return 0; -} - -static struct notifier_block dump_nb = { - .notifier_call = dump_event, - .next = NULL, - .priority = 0 -}; - void __init opal_platform_dump_init(void) { int rc; + int dump_irq; /* ELOG not supported by firmware */ if (!opal_check_token(OPAL_DUMP_READ)) @@ -445,10 +414,19 @@ void __init opal_platform_dump_init(void) return; } - rc = opal_notifier_register(&dump_nb); + dump_irq = opal_event_request(ilog2(OPAL_EVENT_DUMP_AVAIL)); + if (!dump_irq) { + pr_err("%s: Can't register OPAL event irq (%d)\n", + __func__, dump_irq); + return; + } + + rc = request_threaded_irq(dump_irq, NULL, process_dump, + IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + "opal-dump", NULL); if (rc) { - pr_warn("%s: Can't register OPAL event notifier (%d)\n", - __func__, rc); + pr_err("%s: Can't request OPAL event irq (%d)\n", + __func__, rc); return; } -- cgit v1.2.3 From 81f2f7ce4c5bb688ad691cb3ee37e81ca26a8a3b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 May 2015 14:06:44 +1000 Subject: opal: Remove events notifier All users of the old opal events notifier have been converted over to the irq domain so remove the event notifier functions. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-irqchip.c | 16 ++--- arch/powerpc/platforms/powernv/opal.c | 84 +-------------------------- arch/powerpc/platforms/powernv/powernv.h | 1 - arch/powerpc/platforms/powernv/setup.c | 2 +- 4 files changed, 8 insertions(+), 95 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index bd5125dcf0d7..841135f48981 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -100,7 +100,6 @@ void opal_handle_events(uint64_t events) { int virq, hwirq = 0; u64 mask = opal_event_irqchip.mask; - u64 notifier_mask = 0; if (!in_irq() && (events & mask)) { last_outstanding_events = events; @@ -108,19 +107,16 @@ void opal_handle_events(uint64_t events) return; } - while (events) { + while (events & mask) { hwirq = fls64(events) - 1; - virq = irq_find_mapping(opal_event_irqchip.domain, - hwirq); - if (virq) { - if (BIT_ULL(hwirq) & mask) + if (BIT_ULL(hwirq) & mask) { + virq = irq_find_mapping(opal_event_irqchip.domain, + hwirq); + if (virq) generic_handle_irq(virq); - } else - notifier_mask |= BIT_ULL(hwirq); + } events &= ~BIT_ULL(hwirq); } - - opal_do_notifier(notifier_mask); } static irqreturn_t opal_interrupt(int irq, void *data) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index cd5718b74d7c..8403307c5362 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -53,11 +53,7 @@ static int mc_recoverable_range_len; struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); -static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; -static DEFINE_SPINLOCK(opal_notifier_lock); -static uint64_t last_notified_mask = 0x0ul; -static atomic_t opal_notifier_hold = ATOMIC_INIT(0); static uint32_t opal_heartbeat; static void opal_reinit_cores(void) @@ -223,82 +219,6 @@ static int __init opal_register_exception_handlers(void) } machine_early_initcall(powernv, opal_register_exception_handlers); -int opal_notifier_register(struct notifier_block *nb) -{ - if (!nb) { - pr_warning("%s: Invalid argument (%p)\n", - __func__, nb); - return -EINVAL; - } - - atomic_notifier_chain_register(&opal_notifier_head, nb); - return 0; -} -EXPORT_SYMBOL_GPL(opal_notifier_register); - -int opal_notifier_unregister(struct notifier_block *nb) -{ - if (!nb) { - pr_warning("%s: Invalid argument (%p)\n", - __func__, nb); - return -EINVAL; - } - - atomic_notifier_chain_unregister(&opal_notifier_head, nb); - return 0; -} -EXPORT_SYMBOL_GPL(opal_notifier_unregister); - -void opal_do_notifier(uint64_t events) -{ - unsigned long flags; - uint64_t changed_mask; - - if (atomic_read(&opal_notifier_hold)) - return; - - spin_lock_irqsave(&opal_notifier_lock, flags); - changed_mask = last_notified_mask ^ events; - last_notified_mask = events; - spin_unlock_irqrestore(&opal_notifier_lock, flags); - - /* - * We feed with the event bits and changed bits for - * enough information to the callback. - */ - atomic_notifier_call_chain(&opal_notifier_head, - events, (void *)changed_mask); -} - -void opal_notifier_update_evt(uint64_t evt_mask, - uint64_t evt_val) -{ - unsigned long flags; - - spin_lock_irqsave(&opal_notifier_lock, flags); - last_notified_mask &= ~evt_mask; - last_notified_mask |= evt_val; - spin_unlock_irqrestore(&opal_notifier_lock, flags); -} - -void opal_notifier_enable(void) -{ - int64_t rc; - __be64 evt = 0; - - atomic_set(&opal_notifier_hold, 0); - - /* Process pending events */ - rc = opal_poll_events(&evt); - if (rc == OPAL_SUCCESS && evt) - opal_do_notifier(be64_to_cpu(evt)); -} - -void opal_notifier_disable(void) -{ - atomic_set(&opal_notifier_hold, 1); -} - /* * Opal message notifier based on message type. Allow subscribers to get * notified for specific messgae type. @@ -570,10 +490,8 @@ int opal_handle_hmi_exception(struct pt_regs *regs) local_paca->hmi_event_available = 0; rc = opal_poll_events(&evt); - if (rc == OPAL_SUCCESS && evt) { - opal_do_notifier(be64_to_cpu(evt)); + if (rc == OPAL_SUCCESS && evt) opal_handle_events(be64_to_cpu(evt)); - } return 1; } diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 221d4c827fc5..f907f0a494da 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -35,7 +35,6 @@ extern u32 pnv_get_supported_cpuidle_states(void); extern void pnv_lpc_init(void); -extern void opal_do_notifier(uint64_t events); extern void opal_handle_events(uint64_t events); extern void opal_event_shutdown(void); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 509cdd2f7ad8..15e9337b392c 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -107,7 +107,7 @@ static void pnv_prepare_going_down(void) * Disable all notifiers from OPAL, we can't * service interrupts anymore anyway */ - opal_notifier_disable(); + opal_event_shutdown(); /* Soft disable interrupts */ local_irq_disable(); -- cgit v1.2.3 From d6381119a415316c5602710cf7fbf388d15d6d3c Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 14 Apr 2015 14:27:55 +1000 Subject: powerpc/powernv: Move MSI-related ops to pci_controller_ops Move the PowerNV/BML platform to use the pci_controller_ops structure rather than ppc_md for MSI related PCI controller operations. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index bca2aeb6e4b6..0f04940b7fb0 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -768,16 +768,14 @@ void __init pnv_pci_init(void) ppc_md.tce_free_rm = pnv_tce_free_rm; ppc_md.tce_get = pnv_tce_get; set_pci_dma_ops(&dma_iommu_ops); - - /* Configure MSIs */ -#ifdef CONFIG_PCI_MSI - ppc_md.setup_msi_irqs = pnv_setup_msi_irqs; - ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; -#endif } machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init); struct pci_controller_ops pnv_pci_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, +#ifdef CONFIG_PCI_MSI + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, +#endif }; -- cgit v1.2.3 From 7e3d6c5a4be45769a9d439a4b62ad85cfe9e6754 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 14 Apr 2015 14:27:56 +1000 Subject: powerpc/cell: Move MSI-related ops to pci_controller_ops Move the Cell platform to use the pci_controller_ops structure rather than ppc_md for MSI related PCI controller operations. We can be confident that the functions will be added to the platform's ops struct before any PCI controller's ops struct is populated because: 1) These ops are added to the struct in a subsys initcall. We populate the ops in axon_msi_probe, which is the probe call for the axon-msi driver. However the driver is registered in axon_msi_init, which is a subsys initcall, so this will happen at the subsys level. 2) The controller recieves the struct later, in a device initcall. Cell populates the controller in cell_setup_phb, which is hooked up to ppc_md.pci_setup_phb. ppc_md.pci_setup_phb is only ever called in of_platform.c, as part of the OpenFirmware PCI driver's probe routine. That driver is registered in a device initcall, so it will occur *after* the struct is properly populated. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/axon_msi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 623bd961465a..d9b8a443458f 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -22,6 +22,7 @@ #include #include +#include "cell.h" /* * MSIC registers, specified as offsets from dcr_base @@ -406,8 +407,8 @@ static int axon_msi_probe(struct platform_device *device) dev_set_drvdata(&device->dev, msic); - ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs; - ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs; + cell_pci_controller_ops.setup_msi_irqs = axon_msi_setup_msi_irqs; + cell_pci_controller_ops.teardown_msi_irqs = axon_msi_teardown_msi_irqs; axon_msi_debug_setup(dn, msic); -- cgit v1.2.3 From 1d14b8755f0a7d8110f0bdc5b74987f8cc96c18e Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 14 Apr 2015 14:27:57 +1000 Subject: powerpc/pseries: Move MSI-related ops to pci_controller_ops Move the pseries platform to use the pci_controller_ops structure rather than ppc_md for MSI related PCI controller operations We need to iterate all PHBs because the MSI setup happens later than find_and_init_phbs() - mpe. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/msi.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index c8d24f9a6948..c22bb647cce6 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -18,6 +18,8 @@ #include #include +#include "pseries.h" + static int query_token, change_token; #define RTAS_QUERY_FN 0 @@ -505,6 +507,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { + struct pci_controller *phb; + query_token = rtas_token("ibm,query-interrupt-source-number"); change_token = rtas_token("ibm,change-msi"); @@ -516,9 +520,15 @@ static int rtas_msi_init(void) pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); - WARN_ON(ppc_md.setup_msi_irqs); - ppc_md.setup_msi_irqs = rtas_setup_msi_irqs; - ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs; + WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); + pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; + pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; + + list_for_each_entry(phb, &hose_list, list_node) { + WARN_ON(phb->controller_ops.setup_msi_irqs); + phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; + phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; + } WARN_ON(ppc_md.pci_irq_fixup); ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; -- cgit v1.2.3 From 83922966973e19a48b6e59f9fa1259aa790a33c1 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 14 Apr 2015 14:28:01 +1000 Subject: powerpc/pasemi: Move MSI-related ops to pci_controller_ops Move the PaSemi MPIC msi subsystem to use the pci_controller_ops structure rather than ppc_md for MSI related PCI controller operations. As with fsl_msi, operations are plugged in at the subsys level, after controller creation. Again, we iterate over all controllers and populate them with the MSI ops. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pasemi/msi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c index 0b3706604543..27f2b187a91b 100644 --- a/arch/powerpc/platforms/pasemi/msi.c +++ b/arch/powerpc/platforms/pasemi/msi.c @@ -142,6 +142,7 @@ static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) int mpic_pasemi_msi_init(struct mpic *mpic) { int rc; + struct pci_controller *phb; if (!mpic->irqhost->of_node || !of_device_is_compatible(mpic->irqhost->of_node, @@ -157,9 +158,11 @@ int mpic_pasemi_msi_init(struct mpic *mpic) pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n"); msi_mpic = mpic; - WARN_ON(ppc_md.setup_msi_irqs); - ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs; - ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs; + list_for_each_entry(phb, &hose_list, list_node) { + WARN_ON(phb->controller_ops.setup_msi_irqs); + phb->controller_ops.setup_msi_irqs = pasemi_msi_setup_msi_irqs; + phb->controller_ops.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs; + } return 0; } -- cgit v1.2.3 From 92ae03532619dc24fdb7a5ae8ea63785fbd39f86 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 28 Apr 2015 15:12:05 +1000 Subject: powerpc/powernv: Specialise pci_controller_ops for each controller type Remove powernv generic PCI controller operations. Replace it with controller ops for each of the two supported PHBs. As an added bonus, make the two new structs const, which will help guard against bugs such as the one introduced in 65ebf4b63 ("powerpc/powernv: Move controller ops from ppc_md to controller_ops") Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++++++++++++---- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 10 +++++++++- arch/powerpc/platforms/powernv/pci.c | 14 +++----------- arch/powerpc/platforms/powernv/pci.h | 4 ++++ arch/powerpc/platforms/powernv/powernv.h | 2 -- 5 files changed, 28 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f8bc950efcae..23125f46f865 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2648,6 +2648,17 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) OPAL_ASSERT_RESET); } +static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { + .dma_dev_setup = pnv_pci_dma_dev_setup, +#ifdef CONFIG_PCI_MSI + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, +#endif + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, +}; + static void __init pnv_pci_init_ioda_phb(struct device_node *np, u64 hub_id, int ioda_type) { @@ -2808,10 +2819,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * the child P2P bridges) can form individual PE. */ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook; - pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; - pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; - hose->controller_ops = pnv_pci_controller_ops; + hose->controller_ops = pnv_pci_ioda_controller_ops; #ifdef CONFIG_PCI_IOV ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 4729ca793813..7f826e8cc1d4 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -95,6 +95,14 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); } +static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = { + .dma_dev_setup = pnv_pci_dma_dev_setup, +#ifdef CONFIG_PCI_MSI + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, +#endif +}; + static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, void *tce_mem, u64 tce_size) { @@ -133,7 +141,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, phb->hose->first_busno = 0; phb->hose->last_busno = 0xff; phb->hose->private_data = phb; - phb->hose->controller_ops = pnv_pci_controller_ops; + phb->hose->controller_ops = pnv_pci_p5ioc2_controller_ops; phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = PNV_PHB_P5IOC2; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 0f04940b7fb0..dfad2ba13bdf 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -45,7 +45,7 @@ //#define cfg_dbg(fmt...) printk(fmt) #ifdef CONFIG_PCI_MSI -static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; @@ -94,7 +94,7 @@ static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return 0; } -static void pnv_teardown_msi_irqs(struct pci_dev *pdev) +void pnv_teardown_msi_irqs(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; @@ -662,7 +662,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } -static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) +void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; @@ -771,11 +771,3 @@ void __init pnv_pci_init(void) } machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init); - -struct pci_controller_ops pnv_pci_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, -#ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif -}; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 070ee888fc95..15fde52cb5cd 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -218,4 +218,8 @@ extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); +extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); +extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); +extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index f907f0a494da..addd15cdbabc 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -29,8 +29,6 @@ static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) } #endif -extern struct pci_controller_ops pnv_pci_controller_ops; - extern u32 pnv_get_supported_cpuidle_states(void); extern void pnv_lpc_init(void); -- cgit v1.2.3 From 763d2d8df1ee2b92ff09cd58f6034021e2cabf6d Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 28 Apr 2015 15:12:07 +1000 Subject: powerpc/powernv: Move dma_set_mask() from pnv_phb to pci_controller_ops Previously, dma_set_mask() on powernv was convoluted: 0) Call dma_set_mask() (a/p/kernel/dma.c) 1) In dma_set_mask(), ppc_md.dma_set_mask() exists, so call it. 2) On powernv, that function pointer is pnv_dma_set_mask(). In pnv_dma_set_mask(), the device is pci, so call pnv_pci_dma_set_mask(). 3) In pnv_pci_dma_set_mask(), call pnv_phb->set_dma_mask() if it exists. 4) It only exists in the ioda case, where it points to pnv_pci_ioda_dma_set_mask(), which is the final function. So the call chain is: dma_set_mask() -> pnv_dma_set_mask() -> pnv_pci_dma_set_mask() -> pnv_pci_ioda_dma_set_mask() Both ppc_md and pnv_phb function pointers are used. Rip out the ppc_md call, pnv_dma_set_mask() and pnv_pci_dma_set_mask(). Instead: 0) Call dma_set_mask() (a/p/kernel/dma.c) 1) In dma_set_mask(), the device is pci, and pci_controller_ops.dma_set_mask() exists, so call pci_controller_ops.dma_set_mask() 2) In the ioda case, that points to pnv_pci_ioda_dma_set_mask(). The new call chain is dma_set_mask() -> pnv_pci_ioda_dma_set_mask() Now only the pci_controller_ops function pointer is used. The fallback paths for p5ioc2 are the same. Previously, pnv_pci_dma_set_mask() would find no pnv_phb->set_dma_mask() function, to it would call __set_dma_mask(). Now, dma_set_mask() finds no ppc_md call or pci_controller_ops call, so it calls __set_dma_mask(). Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 ++++--- arch/powerpc/platforms/powernv/pci.c | 10 ---------- arch/powerpc/platforms/powernv/pci.h | 2 -- arch/powerpc/platforms/powernv/powernv.h | 6 ------ arch/powerpc/platforms/powernv/setup.c | 8 -------- 5 files changed, 4 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 23125f46f865..508f530e367b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1601,9 +1601,10 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table); } -static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, - struct pci_dev *pdev, u64 dma_mask) +static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) { + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; uint64_t top; @@ -2657,6 +2658,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_pci_ioda_dma_set_mask, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, @@ -2802,7 +2804,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Setup TCEs */ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; - phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; /* Setup shutdown function for kexec */ diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index dfad2ba13bdf..8a557b5aabf7 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -689,16 +689,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) phb->dma_dev_setup(phb, pdev); } -int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - - if (phb && phb->dma_set_mask) - return phb->dma_set_mask(phb, pdev, dma_mask); - return __dma_set_mask(&pdev->dev, dma_mask); -} - u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 15fde52cb5cd..ac8686c853e6 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -106,8 +106,6 @@ struct pnv_phb { unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); - int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, - u64 dma_mask); u64 (*dma_get_required_mask)(struct pnv_phb *phb, struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index addd15cdbabc..9269e30e4ca0 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -12,17 +12,11 @@ struct pci_dev; #ifdef CONFIG_PCI extern void pnv_pci_init(void); extern void pnv_pci_shutdown(void); -extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); extern u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev); #else static inline void pnv_pci_init(void) { } static inline void pnv_pci_shutdown(void) { } -static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) -{ - return -ENODEV; -} - static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) { return 0; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 15e9337b392c..53737e019ae3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -165,13 +165,6 @@ static void pnv_progress(char *s, unsigned short hex) { } -static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (dev_is_pci(dev)) - return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); - return __dma_set_mask(dev, dma_mask); -} - static u64 pnv_dma_get_required_mask(struct device *dev) { if (dev_is_pci(dev)) @@ -321,7 +314,6 @@ define_machine(powernv) { .machine_shutdown = pnv_shutdown, .power_save = power7_idle, .calibrate_decr = generic_calibrate_decr, - .dma_set_mask = pnv_dma_set_mask, .dma_get_required_mask = pnv_dma_get_required_mask, #ifdef CONFIG_KEXEC .kexec_cpu_down = pnv_kexec_cpu_down, -- cgit v1.2.3 From c1231a784acc25ec01d35a019533da9ab8593e2e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 20 May 2015 17:59:52 +0800 Subject: powerpc: Use irq_desc_get_xxx() to avoid redundant lookup of irq_desc Use irq_desc_get_xxx() to avoid redundant lookup of irq_desc while we already have a pointer to corresponding irq_desc. Signed-off-by: Jiang Liu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 +- arch/powerpc/platforms/cell/axon_msi.c | 2 +- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 2 +- arch/powerpc/sysdev/uic.c | 2 +- arch/powerpc/sysdev/xics/xics-common.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index c949ca055712..63016621aff8 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -193,7 +193,7 @@ static struct irq_chip mpc52xx_gpt_irq_chip = { void mpc52xx_gpt_irq_cascade(unsigned int virq, struct irq_desc *desc) { - struct mpc52xx_gpt_priv *gpt = irq_get_handler_data(virq); + struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc); int sub_virq; u32 status; diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index d9b8a443458f..fe51de4fcf13 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -96,7 +96,7 @@ static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val) static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct axon_msic *msic = irq_get_handler_data(irq); + struct axon_msic *msic = irq_desc_get_handler_data(desc); u32 write_offset, msi; int idx; int retry = 0; diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index c269caee58f9..9dd154d6f89a 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -124,7 +124,7 @@ static void hlwd_pic_irq_cascade(unsigned int cascade_virq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irq_domain *irq_domain = irq_get_handler_data(cascade_virq); + struct irq_domain *irq_domain = irq_desc_get_handler_data(desc); unsigned int virq; raw_spin_lock(&desc->lock); diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 1cd057f11725..d77345338671 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -198,7 +198,7 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_data *idata = irq_desc_get_irq_data(desc); - struct uic *uic = irq_get_handler_data(virq); + struct uic *uic = irq_desc_get_handler_data(desc); u32 msr; int src; int subvirq; diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 5bc5889d4acc..08c248eb491b 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -227,7 +227,7 @@ void xics_migrate_irqs_away(void) /* Locate interrupt server */ server = -1; - ics = irq_get_chip_data(virq); + ics = irq_desc_get_chip_data(desc); if (ics) server = ics->get_server(ics, irq); if (server < 0) { -- cgit v1.2.3 From 2b6029e2e026771fb4e2add9b38e91f34725813c Mon Sep 17 00:00:00 2001 From: Shengzhou Liu Date: Thu, 9 Apr 2015 16:07:43 +0800 Subject: powerpc/fsl-booke: Add T1024 QDS board support Add support for Freescale T1024/T1023 QorIQ Development System Board. T1024QDS is a high-performance computing evaluation, development and test platform for T1024 QorIQ Power Architecture processor. Signed-off-by: Shengzhou Liu [scottwood: vendor prefix: s/at24/atmel/ and trimmed detailed board description with too-long lines] Signed-off-by: Scott Wood --- arch/powerpc/boot/dts/fsl/t1023si-post.dtsi | 36 ++-- arch/powerpc/boot/dts/t1024qds.dts | 251 ++++++++++++++++++++++++++ arch/powerpc/platforms/85xx/Kconfig | 2 +- arch/powerpc/platforms/85xx/corenet_generic.c | 1 + 4 files changed, 271 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/boot/dts/t1024qds.dts (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index dbe65783cc2d..df1f068a5376 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -305,26 +305,26 @@ /include/ "qoriq-gpio-2.dtsi" /include/ "qoriq-gpio-3.dtsi" /include/ "qoriq-usb2-mph-0.dtsi" - usb0: usb@210000 { - compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph"; - fsl,iommu-parent = <&pamu0>; - fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */ - phy_type = "utmi"; - port0; - }; + usb0: usb@210000 { + compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph"; + fsl,iommu-parent = <&pamu0>; + fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */ + phy_type = "utmi"; + port0; + }; /include/ "qoriq-usb2-dr-0.dtsi" - usb1: usb@211000 { - compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr"; - fsl,iommu-parent = <&pamu0>; - fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */ - dr_mode = "host"; - phy_type = "utmi"; - }; + usb1: usb@211000 { + compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr"; + fsl,iommu-parent = <&pamu0>; + fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */ + dr_mode = "host"; + phy_type = "utmi"; + }; /include/ "qoriq-sata2-0.dtsi" -sata@220000 { - fsl,iommu-parent = <&pamu0>; - fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */ -}; + sata@220000 { + fsl,iommu-parent = <&pamu0>; + fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */ + }; /include/ "qoriq-sec5.0-0.dtsi" }; diff --git a/arch/powerpc/boot/dts/t1024qds.dts b/arch/powerpc/boot/dts/t1024qds.dts new file mode 100644 index 000000000000..f31fabb383b9 --- /dev/null +++ b/arch/powerpc/boot/dts/t1024qds.dts @@ -0,0 +1,251 @@ +/* + * T1024 QDS Device Tree Source + * + * Copyright 2014 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/include/ "fsl/t102xsi-pre.dtsi" + +/ { + model = "fsl,T1024QDS"; + compatible = "fsl,T1024QDS"; + #address-cells = <2>; + #size-cells = <2>; + interrupt-parent = <&mpic>; + + ifc: localbus@ffe124000 { + reg = <0xf 0xfe124000 0 0x2000>; + ranges = <0 0 0xf 0xe8000000 0x08000000 + 2 0 0xf 0xff800000 0x00010000 + 3 0 0xf 0xffdf0000 0x00008000>; + + nor@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x8000000>; + bank-width = <2>; + device-width = <1>; + }; + + nand@2,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,ifc-nand"; + reg = <0x2 0x0 0x10000>; + }; + + board-control@3,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,tetra-fpga", "fsl,fpga-qixis"; + reg = <3 0 0x300>; + ranges = <0 3 0 0x300>; + }; + }; + + memory { + device_type = "memory"; + }; + + dcsr: dcsr@f00000000 { + ranges = <0x00000000 0xf 0x00000000 0x01072000>; + }; + + soc: soc@ffe000000 { + ranges = <0x00000000 0xf 0xfe000000 0x1000000>; + reg = <0xf 0xfe000000 0 0x00001000>; + spi@110000 { + flash@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "micron,n25q128a11"; /* 16MB */ + reg = <0>; + spi-max-frequency = <10000000>; + }; + + flash@1 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "sst,sst25wf040"; /* 512KB */ + reg = <1>; + spi-max-frequency = <10000000>; + }; + + flash@2 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "eon,en25s64"; /* 8MB */ + reg = <2>; + spi-max-frequency = <10000000>; + }; + + slic@2 { + compatible = "maxim,ds26522"; + reg = <2>; + spi-max-frequency = <2000000>; + }; + + slic@3 { + compatible = "maxim,ds26522"; + reg = <3>; + spi-max-frequency = <2000000>; + }; + }; + + i2c@118000 { + pca9547@77 { + compatible = "nxp,pca9547"; + reg = <0x77>; + #address-cells = <1>; + #size-cells = <0>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0x0>; + + eeprom@50 { + compatible = "atmel,24c512"; + reg = <0x50>; + }; + + eeprom@51 { + compatible = "atmel,24c02"; + reg = <0x51>; + }; + + eeprom@57 { + compatible = "atmel,24c02"; + reg = <0x57>; + }; + }; + + i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0x2>; + + ina220@40 { + compatible = "ti,ina220"; + reg = <0x40>; + shunt-resistor = <1000>; + }; + + ina220@41 { + compatible = "ti,ina220"; + reg = <0x41>; + shunt-resistor = <1000>; + }; + }; + + i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0x3>; + + adt7461@4c { + /* Thermal Monitor */ + compatible = "adi,adt7461"; + reg = <0x4c>; + }; + + eeprom@55 { + compatible = "atmel,24c02"; + reg = <0x55>; + }; + + eeprom@56 { + compatible = "atmel,24c512"; + reg = <0x56>; + }; + + eeprom@57 { + compatible = "atmel,24c512"; + reg = <0x57>; + }; + }; + }; + rtc@68 { + compatible = "dallas,ds3232"; + reg = <0x68>; + interrupts = <0x5 0x1 0 0>; + }; + }; + }; + + pci0: pcie@ffe240000 { + reg = <0xf 0xfe240000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci1: pcie@ffe250000 { + reg = <0xf 0xfe250000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci2: pcie@ffe260000 { + reg = <0xf 0xfe260000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; +}; + +/include/ "fsl/t1024si-post.dtsi" diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index 2fb4b24368a6..e628d4001272 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -282,7 +282,7 @@ config CORENET_GENERIC For 64bit kernel, the following boards are supported: T208x QDS/RDB, T4240 QDS/RDB and B4 QDS The following boards are supported for both 32bit and 64bit kernel: - P5020 DS, P5040 DS and T104xQDS/RDB + P5020 DS, P5040 DS, T102x QDS, T104x QDS/RDB endif # FSL_SOC_BOOKE diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index 9824d2cf79bd..f61f47780486 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -150,6 +150,7 @@ static const char * const boards[] __initconst = { "fsl,B4860QDS", "fsl,B4420QDS", "fsl,B4220QDS", + "fsl,T1024QDS", "fsl,T1040QDS", "fsl,T1042QDS", "fsl,T1040RDB", -- cgit v1.2.3 From 5afe13fd48c72be61342edbc6ede6e60f6153227 Mon Sep 17 00:00:00 2001 From: Shengzhou Liu Date: Thu, 9 Apr 2015 16:07:44 +0800 Subject: powerpc/fsl-booke: Add T1024 RDB board support T1024RDB is a Freescale Reference Design Board that hosts the T1024 SoC. Signed-off-by: Shengzhou Liu [scottwood: vendor prefix: s/at24/atmel/ and trimmed detailed board description with too-long lines] Signed-off-by: Scott Wood --- arch/powerpc/boot/dts/t1024rdb.dts | 185 ++++++++++++++++++++++++++ arch/powerpc/platforms/85xx/Kconfig | 2 +- arch/powerpc/platforms/85xx/corenet_generic.c | 1 + 3 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/boot/dts/t1024rdb.dts (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/boot/dts/t1024rdb.dts b/arch/powerpc/boot/dts/t1024rdb.dts new file mode 100644 index 000000000000..733e723ffed6 --- /dev/null +++ b/arch/powerpc/boot/dts/t1024rdb.dts @@ -0,0 +1,185 @@ +/* + * T1024 RDB Device Tree Source + * + * Copyright 2014 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/include/ "fsl/t102xsi-pre.dtsi" + +/ { + model = "fsl,T1024RDB"; + compatible = "fsl,T1024RDB"; + #address-cells = <2>; + #size-cells = <2>; + interrupt-parent = <&mpic>; + + ifc: localbus@ffe124000 { + reg = <0xf 0xfe124000 0 0x2000>; + ranges = <0 0 0xf 0xe8000000 0x08000000 + 2 0 0xf 0xff800000 0x00010000 + 3 0 0xf 0xffdf0000 0x00008000>; + + nor@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x8000000>; + bank-width = <2>; + device-width = <1>; + }; + + nand@1,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,ifc-nand"; + reg = <0x2 0x0 0x10000>; + }; + + board-control@2,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,t1024-cpld"; + reg = <3 0 0x300>; + ranges = <0 3 0 0x300>; + bank-width = <1>; + device-width = <1>; + }; + }; + + memory { + device_type = "memory"; + }; + + dcsr: dcsr@f00000000 { + ranges = <0x00000000 0xf 0x00000000 0x01072000>; + }; + + soc: soc@ffe000000 { + ranges = <0x00000000 0xf 0xfe000000 0x1000000>; + reg = <0xf 0xfe000000 0 0x00001000>; + spi@110000 { + flash@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "micron,n25q512ax3"; + reg = <0>; + spi-max-frequency = <10000000>; /* input clk */ + }; + + slic@1 { + compatible = "maxim,ds26522"; + reg = <1>; + spi-max-frequency = <2000000>; + }; + + slic@2 { + compatible = "maxim,ds26522"; + reg = <2>; + spi-max-frequency = <2000000>; + }; + }; + + i2c@118000 { + adt7461@4c { + /* Thermal Monitor */ + compatible = "adi,adt7461"; + reg = <0x4c>; + }; + + eeprom@50 { + compatible = "atmel,24c256"; + reg = <0x50>; + }; + + rtc@68 { + compatible = "dallas,ds1339"; + reg = <0x68>; + interrupts = <0x1 0x1 0 0>; + }; + }; + + i2c@118100 { + pca9546@77 { + compatible = "nxp,pca9546"; + reg = <0x77>; + #address-cells = <1>; + #size-cells = <0>; + }; + }; + }; + + pci0: pcie@ffe240000 { + reg = <0xf 0xfe240000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci1: pcie@ffe250000 { + reg = <0xf 0xfe250000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci2: pcie@ffe260000 { + reg = <0xf 0xfe260000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; +}; + +/include/ "fsl/t1024si-post.dtsi" diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index e628d4001272..97915feffd42 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -282,7 +282,7 @@ config CORENET_GENERIC For 64bit kernel, the following boards are supported: T208x QDS/RDB, T4240 QDS/RDB and B4 QDS The following boards are supported for both 32bit and 64bit kernel: - P5020 DS, P5040 DS, T102x QDS, T104x QDS/RDB + P5020 DS, P5040 DS, T102x QDS/RDB, T104x QDS/RDB endif # FSL_SOC_BOOKE diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index f61f47780486..d7c1b4d4254a 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -151,6 +151,7 @@ static const char * const boards[] __initconst = { "fsl,B4420QDS", "fsl,B4220QDS", "fsl,T1024QDS", + "fsl,T1024RDB", "fsl,T1040QDS", "fsl,T1042QDS", "fsl,T1040RDB", -- cgit v1.2.3 From 65bf2a057021b009ff9924b59203993bdff82ecc Mon Sep 17 00:00:00 2001 From: Shengzhou Liu Date: Thu, 9 Apr 2015 16:07:45 +0800 Subject: powerpc/fsl-booke: Add T1023 RDB board support T1023RDB is a Freescale Reference Design Board that hosts T1023 SoC. T1023RDB board Overview ----------------------- - T1023 SoC integrating two 64-bit e5500 cores up to 1.4GHz - CoreNet fabric supporting coherent and noncoherent transactions with prioritization and bandwidth allocation - Memory: 2GB Micron MT40A512M8HX unbuffered 32-bit fixed DDR4 without ECC - Accelerator: DPAA components consist of FMan, BMan, QMan, DCE and SEC - Ethernet interfaces: - one 1G RGMII port on-board(RTL8211F PHY) - one 1G SGMII port on-board(RTL8211F PHY) - one 2.5G SGMII port on-board(AQR105 PHY) - PCIe: Two Mini-PCIe connectors on-board. - SerDes: 4 lanes up to 10.3125GHz - NOR: 128MB S29GL01GS110TFIV10 Spansion NOR Flash - NAND: 512MB S34MS04G200BFI000 Spansion NAND Flash - eSPI: 64MB S25FL512SAGMFI010 Spansion SPI flash - USB: one Type-A USB 2.0 port with internal PHY - eSDHC: support SD/MMC card and eMMC flash on-board - 256Kbit M24256 I2C EEPROM - RTC: Real-time clock DS1339 on I2C bus - UART: one serial port on-board with RJ45 connector - Debugging: JTAG/COP for T1023 debugging Signed-off-by: Shengzhou Liu Signed-off-by: Scott Wood --- arch/powerpc/boot/dts/t1023rdb.dts | 151 ++++++++++++++++++++++++++ arch/powerpc/platforms/85xx/corenet_generic.c | 1 + 2 files changed, 152 insertions(+) create mode 100644 arch/powerpc/boot/dts/t1023rdb.dts (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/boot/dts/t1023rdb.dts b/arch/powerpc/boot/dts/t1023rdb.dts new file mode 100644 index 000000000000..06b090aba066 --- /dev/null +++ b/arch/powerpc/boot/dts/t1023rdb.dts @@ -0,0 +1,151 @@ +/* + * T1023 RDB Device Tree Source + * + * Copyright 2014 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/include/ "fsl/t102xsi-pre.dtsi" + +/ { + model = "fsl,T1023RDB"; + compatible = "fsl,T1023RDB"; + #address-cells = <2>; + #size-cells = <2>; + interrupt-parent = <&mpic>; + + ifc: localbus@ffe124000 { + reg = <0xf 0xfe124000 0 0x2000>; + ranges = <0 0 0xf 0xe8000000 0x08000000 + 1 0 0xf 0xff800000 0x00010000>; + + nor@0,0 { + #address-cells = <1>; + #size-cells = <1>; + status = "disabled"; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x8000000>; + bank-width = <2>; + device-width = <1>; + }; + + nand@1,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,ifc-nand"; + reg = <0x2 0x0 0x10000>; + }; + }; + + memory { + device_type = "memory"; + }; + + dcsr: dcsr@f00000000 { + ranges = <0x00000000 0xf 0x00000000 0x01072000>; + }; + + soc: soc@ffe000000 { + ranges = <0x00000000 0xf 0xfe000000 0x1000000>; + reg = <0xf 0xfe000000 0 0x00001000>; + spi@110000 { + flash@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "spansion,s25fl512s"; + reg = <0>; + spi-max-frequency = <10000000>; /* input clk */ + }; + }; + + i2c@118000 { + eeprom@50 { + compatible = "st,m24256"; + reg = <0x50>; + }; + + rtc@68 { + compatible = "dallas,ds1339"; + reg = <0x68>; + interrupts = <0x5 0x1 0 0>; + }; + }; + + i2c@118100 { + }; + }; + + pci0: pcie@ffe240000 { + reg = <0xf 0xfe240000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci1: pcie@ffe250000 { + reg = <0xf 0xfe250000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci2: pcie@ffe260000 { + reg = <0xf 0xfe260000 0 0x10000>; + ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>; + pcie@0 { + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x10000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; +}; + +/include/ "fsl/t1023si-post.dtsi" diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index d7c1b4d4254a..bd839dc287fe 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -150,6 +150,7 @@ static const char * const boards[] __initconst = { "fsl,B4860QDS", "fsl,B4420QDS", "fsl,B4220QDS", + "fsl,T1023RDB", "fsl,T1024QDS", "fsl,T1024RDB", "fsl,T1040QDS", -- cgit v1.2.3 From 379caf606396a13c8da2742e9a6e0bcfaaffa726 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sun, 17 May 2015 16:12:37 +0800 Subject: powerpc: mpc85xx: flush the l1 cache before cpu down in kexec We observe a "Zero PT_NOTE entries found" warning when vmcore_init() is running on the dump-capture kernel. Actually the PT_NOTE segments is not empty, but the entries generated by crash_save_cpu() are not flushed to the memory before we reset these cores. So we should flush the l1 cache as what we do in cpu hotplug. With this change, we can also kill the mpc85xx_smp_flush_dcache_kexec() since that becomes unnecessary. Please note: this only fix the issue on e500 core, we still need to implement the function to flush the l2 cache for the e500mc core. Fortunately we already had proposing patch for this support [1]. Hope we can fix this issue for e500mc after that merged. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-March/115830.html Signed-off-by: Kevin Hao Signed-off-by: Scott Wood --- arch/powerpc/platforms/85xx/smp.c | 51 +-------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 8631ac5f0e57..b8b821697910 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -345,6 +345,7 @@ void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) local_irq_disable(); if (secondary) { + __flush_disable_L1(); atomic_inc(&kexec_down_cpus); /* loop forever */ while (1); @@ -357,61 +358,11 @@ static void mpc85xx_smp_kexec_down(void *arg) ppc_md.kexec_cpu_down(0,1); } -static void map_and_flush(unsigned long paddr) -{ - struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); - unsigned long kaddr = (unsigned long)kmap_atomic(page); - - flush_dcache_range(kaddr, kaddr + PAGE_SIZE); - kunmap_atomic((void *)kaddr); -} - -/** - * Before we reset the other cores, we need to flush relevant cache - * out to memory so we don't get anything corrupted, some of these flushes - * are performed out of an overabundance of caution as interrupts are not - * disabled yet and we can switch cores - */ -static void mpc85xx_smp_flush_dcache_kexec(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - unsigned long paddr; - int i; - - if (image->type == KEXEC_TYPE_DEFAULT) { - /* normal kexec images are stored in temporary pages */ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); - ptr = (entry & IND_INDIRECTION) ? - phys_to_virt(entry & PAGE_MASK) : ptr + 1) { - if (!(entry & IND_DESTINATION)) { - map_and_flush(entry); - } - } - /* flush out last IND_DONE page */ - map_and_flush(entry); - } else { - /* crash type kexec images are copied to the crash region */ - for (i = 0; i < image->nr_segments; i++) { - struct kexec_segment *seg = &image->segment[i]; - for (paddr = seg->mem; paddr < seg->mem + seg->memsz; - paddr += PAGE_SIZE) { - map_and_flush(paddr); - } - } - } - - /* also flush the kimage struct to be passed in as well */ - flush_dcache_range((unsigned long)image, - (unsigned long)image + sizeof(*image)); -} - static void mpc85xx_smp_machine_kexec(struct kimage *image) { int timeout = INT_MAX; int i, num_cpus = num_present_cpus(); - mpc85xx_smp_flush_dcache_kexec(image); - if (image->type == KEXEC_TYPE_DEFAULT) smp_call_function(mpc85xx_smp_kexec_down, NULL, 0); -- cgit v1.2.3 From 31ea9d5dfd27b9aa6f92c207041eba8e81f4c497 Mon Sep 17 00:00:00 2001 From: Xie Xiaobo Date: Fri, 22 May 2015 13:07:19 +0800 Subject: powerpc/85xx: p1025twr: add module conditional to fix QE-uart issue A ioport setting was needed when used the QE uart function on TWR-P1025. Added a conditional definition to avoid missing this setting when the QE-uart driver was bulit to a module. Signed-off-by: Xie Xiaobo Signed-off-by: Li Pengbo Signed-off-by: Scott Wood --- arch/powerpc/platforms/85xx/twr_p102x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/85xx/twr_p102x.c b/arch/powerpc/platforms/85xx/twr_p102x.c index 1eadb6d0dc64..30e002f4648c 100644 --- a/arch/powerpc/platforms/85xx/twr_p102x.c +++ b/arch/powerpc/platforms/85xx/twr_p102x.c @@ -79,7 +79,7 @@ static void __init twr_p1025_setup_arch(void) mpc85xx_qe_init(); mpc85xx_qe_par_io_init(); -#if defined(CONFIG_UCC_GETH) || defined(CONFIG_SERIAL_QE) +#if IS_ENABLED(CONFIG_UCC_GETH) || IS_ENABLED(CONFIG_SERIAL_QE) if (machine_is(twr_p1025)) { struct ccsr_guts __iomem *guts; @@ -101,7 +101,7 @@ static void __init twr_p1025_setup_arch(void) MPC85xx_PMUXCR_QE(12)); iounmap(guts); -#if defined(CONFIG_SERIAL_QE) +#if IS_ENABLED(CONFIG_SERIAL_QE) /* On P1025TWR board, the UCC7 acted as UART port. * However, The UCC7's CTS pin is low level in default, * it will impact the transmission in full duplex -- cgit v1.2.3 From 7a8e6bbf8593a9395dd6c61f7c5f421570600017 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 27 May 2015 16:06:59 +1000 Subject: powerpc/pci: Add shutdown hook to pci_controller_ops Currently pnv_pci_shutdown() calls the PHB shutdown code for all PHBs in the system. It dereferences the private_data assuming it's a powernv PHB, which won't be the case when we have different PHB in the systems (like when we add vPHBs for CXL). This moves the shutdown hook to the pci_controller_ops and fixes the call site to use that instead. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pci-bridge.h | 2 ++ arch/powerpc/platforms/powernv/pci-ioda.c | 8 ++++---- arch/powerpc/platforms/powernv/pci.c | 9 +++------ arch/powerpc/platforms/powernv/pci.h | 1 - 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 4cf0caaa33c7..3947749de280 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -40,6 +40,8 @@ struct pci_controller_ops { #endif int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); + + void (*shutdown)(struct pci_controller *); }; /* diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 508f530e367b..cec43c8dc44d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2643,8 +2643,10 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; } -static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) +static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { + struct pnv_phb *phb = hose->private_data; + opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); } @@ -2659,6 +2661,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, @@ -2806,9 +2809,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; - /* Setup shutdown function for kexec */ - phb->shutdown = pnv_pci_ioda_shutdown; - /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 8a557b5aabf7..3c35487e9778 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -704,12 +704,9 @@ void pnv_pci_shutdown(void) { struct pci_controller *hose; - list_for_each_entry(hose, &hose_list, list_node) { - struct pnv_phb *phb = hose->private_data; - - if (phb && phb->shutdown) - phb->shutdown(phb); - } + list_for_each_entry(hose, &hose_list, list_node) + if (hose->controller_ops.shutdown) + hose->controller_ops.shutdown(hose); } /* Fixup wrong class code in p7ioc and p8 root complex */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index ac8686c853e6..714ee3c19854 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -110,7 +110,6 @@ struct pnv_phb { struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); - void (*shutdown)(struct pnv_phb *phb); int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pnv_phb *phb); int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); -- cgit v1.2.3 From ec249dd860ed88e15b3e2bd363cbfc76ba8c1884 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 27 May 2015 16:07:16 +1000 Subject: cxl: Move include file cxl.h -> cxl-base.h This moves the current include file from cxl.h -> cxl-base.h. This current include file is used only to pass information between the base driver that needs to be built into the kernel and the cxl module. This is to make way for a new include/misc/cxl.h which will contain just the kernel API for other driver to use Signed-off-by: Michael Neuling Acked-by: Ian Munsie Signed-off-by: Michael Ellerman --- MAINTAINERS | 2 +- arch/powerpc/include/asm/pnv-pci.h | 2 +- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/hash_native_64.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- drivers/misc/cxl/base.c | 2 +- drivers/misc/cxl/cxl.h | 2 +- drivers/misc/cxl/irq.c | 2 +- drivers/misc/cxl/main.c | 2 +- drivers/misc/cxl/native.c | 2 +- include/misc/cxl-base.h | 48 +++++++++++++++++++++++++++++++ include/misc/cxl.h | 48 ------------------------------- 12 files changed, 58 insertions(+), 58 deletions(-) create mode 100644 include/misc/cxl-base.h delete mode 100644 include/misc/cxl.h (limited to 'arch/powerpc/platforms') diff --git a/MAINTAINERS b/MAINTAINERS index 562ae4ef85c8..80627ddba3e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2946,7 +2946,7 @@ M: Michael Neuling L: linuxppc-dev@lists.ozlabs.org S: Supported F: drivers/misc/cxl/ -F: include/misc/cxl.h +F: include/misc/cxl* F: include/uapi/misc/cxl.h F: Documentation/powerpc/cxl.txt F: Documentation/powerpc/cxl.txt diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index f9b498292a5c..6f77f71ee964 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -11,7 +11,7 @@ #define _ASM_PNV_PCI_H #include -#include +#include int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode); int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 7a71d7f81cf6..6527882ce05e 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include /* * This ought to be kept in sync with the powerpc specific do_page_fault diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 9c4880ddecd6..13befa35d8a8 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -29,7 +29,7 @@ #include #include -#include +#include #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index cec43c8dc44d..b2841853550a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include "powernv.h" #include "pci.h" diff --git a/drivers/misc/cxl/base.c b/drivers/misc/cxl/base.c index 0654ad83675e..a9f0dd3255a2 100644 --- a/drivers/misc/cxl/base.c +++ b/drivers/misc/cxl/base.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "cxl.h" /* protected by rcu */ diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 778161afd0a3..46d806e3b943 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c index 212790b4ee57..680cd263436d 100644 --- a/drivers/misc/cxl/irq.c +++ b/drivers/misc/cxl/irq.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "cxl.h" #include "trace.h" diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c index 8ccddceead66..833348e2c9cb 100644 --- a/drivers/misc/cxl/main.c +++ b/drivers/misc/cxl/main.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "cxl.h" #include "trace.h" diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index a4b40d72466a..10567f245818 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "cxl.h" #include "trace.h" diff --git a/include/misc/cxl-base.h b/include/misc/cxl-base.h new file mode 100644 index 000000000000..5ae962512fb8 --- /dev/null +++ b/include/misc/cxl-base.h @@ -0,0 +1,48 @@ +/* + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _MISC_CXL_BASE_H +#define _MISC_CXL_BASE_H + +#ifdef CONFIG_CXL_BASE + +#define CXL_IRQ_RANGES 4 + +struct cxl_irq_ranges { + irq_hw_number_t offset[CXL_IRQ_RANGES]; + irq_hw_number_t range[CXL_IRQ_RANGES]; +}; + +extern atomic_t cxl_use_count; + +static inline bool cxl_ctx_in_use(void) +{ + return (atomic_read(&cxl_use_count) != 0); +} + +static inline void cxl_ctx_get(void) +{ + atomic_inc(&cxl_use_count); +} + +static inline void cxl_ctx_put(void) +{ + atomic_dec(&cxl_use_count); +} + +void cxl_slbia(struct mm_struct *mm); + +#else /* CONFIG_CXL_BASE */ + +static inline bool cxl_ctx_in_use(void) { return false; } +static inline void cxl_slbia(struct mm_struct *mm) {} + +#endif /* CONFIG_CXL_BASE */ + +#endif diff --git a/include/misc/cxl.h b/include/misc/cxl.h deleted file mode 100644 index 975cc7861f18..000000000000 --- a/include/misc/cxl.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2014 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _MISC_CXL_H -#define _MISC_CXL_H - -#ifdef CONFIG_CXL_BASE - -#define CXL_IRQ_RANGES 4 - -struct cxl_irq_ranges { - irq_hw_number_t offset[CXL_IRQ_RANGES]; - irq_hw_number_t range[CXL_IRQ_RANGES]; -}; - -extern atomic_t cxl_use_count; - -static inline bool cxl_ctx_in_use(void) -{ - return (atomic_read(&cxl_use_count) != 0); -} - -static inline void cxl_ctx_get(void) -{ - atomic_inc(&cxl_use_count); -} - -static inline void cxl_ctx_put(void) -{ - atomic_dec(&cxl_use_count); -} - -void cxl_slbia(struct mm_struct *mm); - -#else /* CONFIG_CXL_BASE */ - -static inline bool cxl_ctx_in_use(void) { return false; } -static inline void cxl_slbia(struct mm_struct *mm) {} - -#endif /* CONFIG_CXL_BASE */ - -#endif -- cgit v1.2.3 From 14aae78f084af5241476ede8e28b377fc751b191 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 8 Apr 2015 09:31:10 +0200 Subject: powerpc/powernv: convert OPAL codes returned by sysparam calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The opal_{get,set}_param calls return internal error codes which need to be translated in errnos in Linux. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-sysparam.c | 12 ++++++++---- arch/powerpc/platforms/powernv/opal.c | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 2e52b47393e7..afe66c576a38 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -55,8 +55,10 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) } ret = opal_get_param(token, param_id, (u64)buffer, length); - if (ret != OPAL_ASYNC_COMPLETION) + if (ret != OPAL_ASYNC_COMPLETION) { + ret = opal_error_code(ret); goto out_token; + } ret = opal_async_wait_response(token, &msg); if (ret) { @@ -65,7 +67,7 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = be64_to_cpu(msg.params[1]); + ret = opal_error_code(be64_to_cpu(msg.params[1])); out_token: opal_async_release_token(token); @@ -89,8 +91,10 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) ret = opal_set_param(token, param_id, (u64)buffer, length); - if (ret != OPAL_ASYNC_COMPLETION) + if (ret != OPAL_ASYNC_COMPLETION) { + ret = opal_error_code(ret); goto out_token; + } ret = opal_async_wait_response(token, &msg); if (ret) { @@ -99,7 +103,7 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = be64_to_cpu(msg.params[1]); + ret = opal_error_code(be64_to_cpu(msg.params[1])); out_token: opal_async_release_token(token); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 8403307c5362..fd694bd51b80 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -830,6 +830,7 @@ int opal_error_code(int rc) case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; + case OPAL_PERMISSION: return -EPERM; case OPAL_UNSUPPORTED: return -EIO; case OPAL_HARDWARE: return -EIO; -- cgit v1.2.3 From 48c06154952a18c5cd4d074ec3de627430cb6d23 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 20 May 2015 11:23:33 +0800 Subject: powerpc/powernv: Merge common platform device initialisation opal_ipmi_init and opal_flash_init are equivalent, except for the compatbile string. Merge these two into a common opal_pdev_init function. Signed-off-by: Jeremy Kerr Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index fd694bd51b80..fdce840f0006 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -600,21 +600,13 @@ static void __init opal_dump_region_init(void) "rc = %d\n", rc); } -static void opal_flash_init(struct device_node *opal_node) +static void opal_pdev_init(struct device_node *opal_node, + const char *compatible) { struct device_node *np; for_each_child_of_node(opal_node, np) - if (of_device_is_compatible(np, "ibm,opal-flash")) - of_platform_device_create(np, NULL, NULL); -} - -static void opal_ipmi_init(struct device_node *opal_node) -{ - struct device_node *np; - - for_each_child_of_node(opal_node, np) - if (of_device_is_compatible(np, "ibm,opal-ipmi")) + if (of_device_is_compatible(np, compatible)) of_platform_device_create(np, NULL, NULL); } @@ -717,10 +709,9 @@ static int __init opal_init(void) opal_msglog_init(); } - /* Initialize OPAL IPMI backend */ - opal_ipmi_init(opal_node); - - opal_flash_init(opal_node); + /* Initialize platform devices: IPMI backend & flash interface */ + opal_pdev_init(opal_node, "ibm,opal-ipmi"); + opal_pdev_init(opal_node, "ibm,opal-flash"); return 0; } -- cgit v1.2.3 From 594fcb9ec9e17215b4f98b5d40ba5e3af618ba82 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 20 May 2015 11:23:33 +0800 Subject: powerpc/powernv: Expose OPAL APIs required by PRD interface The (upcoming) opal-prd driver needs to access the message notifier and xscom code, so add EXPORT_SYMBOL_GPL macros for these. Signed-off-by: Jeremy Kerr Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index fdce840f0006..0379068e1c10 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -235,6 +235,7 @@ int opal_message_notifier_register(enum opal_msg_type msg_type, return atomic_notifier_chain_register( &opal_msg_notifier_head[msg_type], nb); } +EXPORT_SYMBOL_GPL(opal_message_notifier_register); int opal_message_notifier_unregister(enum opal_msg_type msg_type, struct notifier_block *nb) @@ -242,6 +243,7 @@ int opal_message_notifier_unregister(enum opal_msg_type msg_type, return atomic_notifier_chain_unregister( &opal_msg_notifier_head[msg_type], nb); } +EXPORT_SYMBOL_GPL(opal_message_notifier_unregister); static void opal_message_do_notify(uint32_t msg_type, void *msg) { @@ -743,6 +745,8 @@ void opal_shutdown(void) /* Export this so that test modules can use it */ EXPORT_SYMBOL_GPL(opal_invalid_call); +EXPORT_SYMBOL_GPL(opal_xscom_read); +EXPORT_SYMBOL_GPL(opal_xscom_write); EXPORT_SYMBOL_GPL(opal_ipmi_send); EXPORT_SYMBOL_GPL(opal_ipmi_recv); EXPORT_SYMBOL_GPL(opal_flash_read); -- cgit v1.2.3 From 0d7cd8550d30906c7461ced654306da30f1590e2 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 4 Jun 2015 21:51:47 +0800 Subject: powerpc/powernv: Add opal-prd channel This change adds a char device to access the "PRD" (processor runtime diagnostics) channel to OPAL firmware. Includes contributions from Vaidyanathan Srinivasan, Neelesh Gupta & Vishal Kulkarni. Signed-off-by: Neelesh Gupta Signed-off-by: Jeremy Kerr Acked-by: Stewart Smith Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 21 +- arch/powerpc/include/asm/opal.h | 1 + arch/powerpc/include/uapi/asm/opal-prd.h | 58 ++++ arch/powerpc/platforms/powernv/Kconfig | 7 + arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-prd.c | 444 +++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/opal.c | 4 +- 8 files changed, 535 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/include/uapi/asm/opal-prd.h create mode 100644 arch/powerpc/platforms/powernv/opal-prd.c (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index a49e5fa6f939..e9e4c52f3685 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -153,7 +153,8 @@ #define OPAL_FLASH_READ 110 #define OPAL_FLASH_WRITE 111 #define OPAL_FLASH_ERASE 112 -#define OPAL_LAST 112 +#define OPAL_PRD_MSG 113 +#define OPAL_LAST 113 /* Device tree flags */ @@ -359,6 +360,7 @@ enum opal_msg_type { OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, + OPAL_MSG_PRD, OPAL_MSG_TYPE_MAX, }; @@ -681,6 +683,23 @@ typedef struct oppanel_line { __be64 line_len; } oppanel_line_t; +enum opal_prd_msg_type { + OPAL_PRD_MSG_TYPE_INIT = 0, /* HBRT --> OPAL */ + OPAL_PRD_MSG_TYPE_FINI, /* HBRT/kernel --> OPAL */ + OPAL_PRD_MSG_TYPE_ATTN, /* HBRT <-- OPAL */ + OPAL_PRD_MSG_TYPE_ATTN_ACK, /* HBRT --> OPAL */ + OPAL_PRD_MSG_TYPE_OCC_ERROR, /* HBRT <-- OPAL */ + OPAL_PRD_MSG_TYPE_OCC_RESET, /* HBRT <-- OPAL */ +}; + +struct opal_prd_msg_header { + uint8_t type; + uint8_t pad[1]; + __be16 size; +}; + +struct opal_prd_msg; + /* * SG entries * diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 1412814347ba..958e941c0cda 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -194,6 +194,7 @@ int64_t opal_ipmi_recv(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t *msg_len); int64_t opal_i2c_request(uint64_t async_token, uint32_t bus_id, struct opal_i2c_request *oreq); +int64_t opal_prd_msg(struct opal_prd_msg *msg); int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf, uint64_t size, uint64_t token); diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h new file mode 100644 index 000000000000..319ff4a26158 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/opal-prd.h @@ -0,0 +1,58 @@ +/* + * OPAL Runtime Diagnostics interface driver + * Supported on POWERNV platform + * + * (C) Copyright IBM 2015 + * + * Author: Vaidyanathan Srinivasan + * Author: Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _UAPI_ASM_POWERPC_OPAL_PRD_H_ +#define _UAPI_ASM_POWERPC_OPAL_PRD_H_ + +#include + +/** + * The version of the kernel interface of the PRD system. This describes the + * interface available for the /dev/opal-prd device. The actual PRD message + * layout and content is private to the firmware <--> userspace interface, so + * is not covered by this versioning. + * + * Future interface versions are backwards-compatible; if a later kernel + * version is encountered, functionality provided in earlier versions + * will work. + */ +#define OPAL_PRD_KERNEL_VERSION 1 + +#define OPAL_PRD_GET_INFO _IOR('o', 0x01, struct opal_prd_info) +#define OPAL_PRD_SCOM_READ _IOR('o', 0x02, struct opal_prd_scom) +#define OPAL_PRD_SCOM_WRITE _IOW('o', 0x03, struct opal_prd_scom) + +#ifndef __ASSEMBLY__ + +struct opal_prd_info { + __u64 version; + __u64 reserved[3]; +}; + +struct opal_prd_scom { + __u64 chip; + __u64 addr; + __u64 data; + __s64 rc; +}; + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */ diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 4b044d8cb49a..604190cab522 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -19,3 +19,10 @@ config PPC_POWERNV select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL default y + +config OPAL_PRD + tristate 'OPAL PRD driver' + depends on PPC_POWERNV + help + This enables the opal-prd driver, a facility to run processor + recovery diagnostics on OpenPower machines diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 26bd7e417b7c..1c8cdb6250e7 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o +obj-$(CONFIG_OPAL_PRD) += opal-prd.o diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c new file mode 100644 index 000000000000..d9e72bde79f5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -0,0 +1,444 @@ +/* + * OPAL Runtime Diagnostics interface driver + * Supported on POWERNV platform + * + * Copyright IBM Corporation 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "opal-prd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct opal_prd_msg_queue_item { + struct opal_prd_msg_header msg; + struct list_head list; +}; + +static struct device_node *prd_node; +static LIST_HEAD(opal_prd_msg_queue); +static DEFINE_SPINLOCK(opal_prd_msg_queue_lock); +static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait); +static atomic_t prd_usage; + +static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size) +{ + struct device_node *parent, *node; + bool found; + + if (addr + size < addr) + return false; + + parent = of_find_node_by_path("/reserved-memory"); + if (!parent) + return false; + + found = false; + + for_each_child_of_node(parent, node) { + uint64_t range_addr, range_size, range_end; + const __be32 *addrp; + const char *label; + + addrp = of_get_address(node, 0, &range_size, NULL); + + range_addr = of_read_number(addrp, 2); + range_end = range_addr + range_size; + + label = of_get_property(node, "ibm,prd-label", NULL); + + /* PRD ranges need a label */ + if (!label) + continue; + + if (range_end <= range_addr) + continue; + + if (addr >= range_addr && addr + size <= range_end) { + found = true; + of_node_put(node); + break; + } + } + + of_node_put(parent); + return found; +} + +static int opal_prd_open(struct inode *inode, struct file *file) +{ + /* + * Prevent multiple (separate) processes from concurrent interactions + * with the FW PRD channel + */ + if (atomic_xchg(&prd_usage, 1) == 1) + return -EBUSY; + + return 0; +} + +/* + * opal_prd_mmap - maps firmware-provided ranges into userspace + * @file: file structure for the device + * @vma: VMA to map the registers into + */ + +static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t addr, size; + int rc; + + pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_flags); + + addr = vma->vm_pgoff << PAGE_SHIFT; + size = vma->vm_end - vma->vm_start; + + /* ensure we're mapping within one of the allowable ranges */ + if (!opal_prd_range_is_valid(addr, size)) + return -EINVAL; + + vma->vm_page_prot = __pgprot(pgprot_val(phys_mem_access_prot(file, + vma->vm_pgoff, + size, vma->vm_page_prot)) + | _PAGE_SPECIAL); + + rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, + vma->vm_page_prot); + + return rc; +} + +static bool opal_msg_queue_empty(void) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + ret = list_empty(&opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + return ret; +} + +static unsigned int opal_prd_poll(struct file *file, + struct poll_table_struct *wait) +{ + poll_wait(file, &opal_prd_msg_wait, wait); + + if (!opal_msg_queue_empty()) + return POLLIN | POLLRDNORM; + + return 0; +} + +static ssize_t opal_prd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct opal_prd_msg_queue_item *item; + unsigned long flags; + ssize_t size, err; + int rc; + + /* we need at least a header's worth of data */ + if (count < sizeof(item->msg)) + return -EINVAL; + + if (*ppos) + return -ESPIPE; + + item = NULL; + + for (;;) { + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + if (!list_empty(&opal_prd_msg_queue)) { + item = list_first_entry(&opal_prd_msg_queue, + struct opal_prd_msg_queue_item, list); + list_del(&item->list); + } + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + if (item) + break; + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(opal_prd_msg_wait, + !opal_msg_queue_empty()); + if (rc) + return -EINTR; + } + + size = be16_to_cpu(item->msg.size); + if (size > count) { + err = -EINVAL; + goto err_requeue; + } + + rc = copy_to_user(buf, &item->msg, size); + if (rc) { + err = -EFAULT; + goto err_requeue; + } + + kfree(item); + + return size; + +err_requeue: + /* eep! re-queue at the head of the list */ + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + list_add(&item->list, &opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + return err; +} + +static ssize_t opal_prd_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct opal_prd_msg_header hdr; + ssize_t size; + void *msg; + int rc; + + size = sizeof(hdr); + + if (count < size) + return -EINVAL; + + /* grab the header */ + rc = copy_from_user(&hdr, buf, sizeof(hdr)); + if (rc) + return -EFAULT; + + size = be16_to_cpu(hdr.size); + + msg = kmalloc(size, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = copy_from_user(msg, buf, size); + if (rc) { + size = -EFAULT; + goto out_free; + } + + rc = opal_prd_msg(msg); + if (rc) { + pr_warn("write: opal_prd_msg returned %d\n", rc); + size = -EIO; + } + +out_free: + kfree(msg); + + return size; +} + +static int opal_prd_release(struct inode *inode, struct file *file) +{ + struct opal_prd_msg_header msg; + + msg.size = cpu_to_be16(sizeof(msg)); + msg.type = OPAL_PRD_MSG_TYPE_FINI; + + opal_prd_msg((struct opal_prd_msg *)&msg); + + atomic_xchg(&prd_usage, 0); + + return 0; +} + +static long opal_prd_ioctl(struct file *file, unsigned int cmd, + unsigned long param) +{ + struct opal_prd_info info; + struct opal_prd_scom scom; + int rc = 0; + + switch (cmd) { + case OPAL_PRD_GET_INFO: + memset(&info, 0, sizeof(info)); + info.version = OPAL_PRD_KERNEL_VERSION; + rc = copy_to_user((void __user *)param, &info, sizeof(info)); + if (rc) + return -EFAULT; + break; + + case OPAL_PRD_SCOM_READ: + rc = copy_from_user(&scom, (void __user *)param, sizeof(scom)); + if (rc) + return -EFAULT; + + scom.rc = opal_xscom_read(scom.chip, scom.addr, + (__be64 *)&scom.data); + scom.data = be64_to_cpu(scom.data); + pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n", + scom.chip, scom.addr, scom.data, scom.rc); + + rc = copy_to_user((void __user *)param, &scom, sizeof(scom)); + if (rc) + return -EFAULT; + break; + + case OPAL_PRD_SCOM_WRITE: + rc = copy_from_user(&scom, (void __user *)param, sizeof(scom)); + if (rc) + return -EFAULT; + + scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data); + pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n", + scom.chip, scom.addr, scom.data, scom.rc); + + rc = copy_to_user((void __user *)param, &scom, sizeof(scom)); + if (rc) + return -EFAULT; + break; + + default: + rc = -EINVAL; + } + + return rc; +} + +static const struct file_operations opal_prd_fops = { + .open = opal_prd_open, + .mmap = opal_prd_mmap, + .poll = opal_prd_poll, + .read = opal_prd_read, + .write = opal_prd_write, + .unlocked_ioctl = opal_prd_ioctl, + .release = opal_prd_release, + .owner = THIS_MODULE, +}; + +static struct miscdevice opal_prd_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "opal-prd", + .fops = &opal_prd_fops, +}; + +/* opal interface */ +static int opal_prd_msg_notifier(struct notifier_block *nb, + unsigned long msg_type, void *_msg) +{ + struct opal_prd_msg_queue_item *item; + struct opal_prd_msg_header *hdr; + struct opal_msg *msg = _msg; + unsigned long flags; + int size; + + if (msg_type != OPAL_MSG_PRD) + return 0; + + /* Calculate total size of the item we need to store. The 'size' field + * in the header includes the header itself. */ + hdr = (void *)msg->params; + size = (sizeof(*item) - sizeof(item->msg)) + be16_to_cpu(hdr->size); + + item = kzalloc(size, GFP_ATOMIC); + if (!item) + return -ENOMEM; + + memcpy(&item->msg, msg->params, size); + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + list_add_tail(&item->list, &opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + wake_up_interruptible(&opal_prd_msg_wait); + + return 0; +} + +static struct notifier_block opal_prd_event_nb = { + .notifier_call = opal_prd_msg_notifier, + .next = NULL, + .priority = 0, +}; + +static int opal_prd_probe(struct platform_device *pdev) +{ + int rc; + + if (!pdev || !pdev->dev.of_node) + return -ENODEV; + + /* We should only have one prd driver instance per machine; ensure + * that we only get a valid probe on a single OF node. + */ + if (prd_node) + return -EBUSY; + + prd_node = pdev->dev.of_node; + + rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb); + if (rc) { + pr_err("Couldn't register event notifier\n"); + return rc; + } + + rc = misc_register(&opal_prd_dev); + if (rc) { + pr_err("failed to register miscdev\n"); + opal_message_notifier_unregister(OPAL_MSG_PRD, + &opal_prd_event_nb); + return rc; + } + + return 0; +} + +static int opal_prd_remove(struct platform_device *pdev) +{ + misc_deregister(&opal_prd_dev); + opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + return 0; +} + +static const struct of_device_id opal_prd_match[] = { + { .compatible = "ibm,opal-prd" }, + { }, +}; + +static struct platform_driver opal_prd_driver = { + .driver = { + .name = "opal-prd", + .owner = THIS_MODULE, + .of_match_table = opal_prd_match, + }, + .probe = opal_prd_probe, + .remove = opal_prd_remove, +}; + +module_platform_driver(opal_prd_driver); + +MODULE_DEVICE_TABLE(of, opal_prd_match); +MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index bf15ead00e12..d6a7b8252e4d 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -296,3 +296,4 @@ OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 0379068e1c10..9e9c483eee4d 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -711,9 +711,10 @@ static int __init opal_init(void) opal_msglog_init(); } - /* Initialize platform devices: IPMI backend & flash interface */ + /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init(opal_node, "ibm,opal-ipmi"); opal_pdev_init(opal_node, "ibm,opal-flash"); + opal_pdev_init(opal_node, "ibm,opal-prd"); return 0; } @@ -752,6 +753,7 @@ EXPORT_SYMBOL_GPL(opal_ipmi_recv); EXPORT_SYMBOL_GPL(opal_flash_read); EXPORT_SYMBOL_GPL(opal_flash_write); EXPORT_SYMBOL_GPL(opal_flash_erase); +EXPORT_SYMBOL_GPL(opal_prd_msg); /* Convert a region of vmalloc memory to an opal sg list */ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, -- cgit v1.2.3 From ea30e99e8eccb684490f40d011ea534ecd937e98 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:53 +1000 Subject: powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group This relies on the fact that a PCI device always has an IOMMU table which may not be the case when we get dynamic DMA windows so let's use more reliable check for IOMMU group here. As we do not rely on the table presence here, remove the workaround from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group parameter from pnv_ioda_setup_bus_dma(). Signed-off-by: Alexey Kardashevskiy Acked-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 4 +--- arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++++---------------------- 2 files changed, 6 insertions(+), 25 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 51dcdf66e9e6..af9b597b10af 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1410,13 +1410,11 @@ static int dev_has_iommu_table(struct device *dev, void *data) { struct pci_dev *pdev = to_pci_dev(dev); struct pci_dev **ppdev = data; - struct iommu_table *tbl; if (!dev) return 0; - tbl = get_iommu_table_base(dev); - if (tbl && tbl->it_group) { + if (dev->iommu_group) { *ppdev = pdev; return 1; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b2841853550a..5491d2f0e192 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1655,21 +1655,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb, } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus, - bool add_to_iommu_group) + struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - if (add_to_iommu_group) - set_iommu_table_base_and_group(&dev->dev, - pe->tce32_table); - else - set_iommu_table_base(&dev->dev, pe->tce32_table); + set_iommu_table_base_and_group(&dev->dev, pe->tce32_table); if (dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate, - add_to_iommu_group); + pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } @@ -1846,7 +1840,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } else if (pe->flags & PNV_IODA_PE_VF) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); @@ -1883,17 +1877,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) window_id, pe->tce_bypass_base, 0); - - /* - * EEH needs the mapping between IOMMU table and group - * of those VFIO/KVM pass-through devices. We can postpone - * resetting DMA ops until the DMA mask is configured in - * host side. - */ - if (pe->pdev) - set_iommu_table_base(&pe->pdev->dev, tbl); - else - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); } if (rc) pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); @@ -1985,7 +1968,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } else if (pe->flags & PNV_IODA_PE_VF) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); -- cgit v1.2.3 From 4617082ec049d92a797e8be0b4ba2ba6b59d90d1 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:54 +1000 Subject: powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group The set_iommu_table_base_and_group() name suggests that the function sets table base and add a device to an IOMMU group. The actual purpose for table base setting is to put some reference into a device so later iommu_add_device() can get the IOMMU group reference and the device to the group. At the moment a group cannot be explicitly passed to iommu_add_device() as we want it to work from the bus notifier, we can fix it later and remove confusing calls of set_iommu_table_base(). This replaces set_iommu_table_base_and_group() with a couple of set_iommu_table_base() + iommu_add_device() which makes reading the code easier. This adds few comments why set_iommu_table_base() and iommu_add_device() are called where they are called. For IODA1/2, this essentially removes iommu_add_device() call from the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular place: - for physical PE, the device is already attached by iommu_add_device() in pnv_pci_ioda_setup_dma_pe(); - for virtual PE, the sysfs entries are not ready to create all symlinks so actual adding is happening in tce_iommu_bus_notifier. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 7 ------- arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++++++++++++++++++++++---- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 ++- arch/powerpc/platforms/pseries/iommu.c | 15 ++++++++------- 4 files changed, 33 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 1e27d6338565..8353c865db6f 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void) } #endif /* !CONFIG_IOMMU_API */ -static inline void set_iommu_table_base_and_group(struct device *dev, - void *base) -{ - set_iommu_table_base(dev, base); - iommu_add_device(dev); -} - extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, struct scatterlist *sglist, int nelems, unsigned long mask, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5491d2f0e192..fd594b28fce1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->tce32_table); + /* + * Note: iommu_add_device() will fail here as + * for physical PE: the device is already added by now; + * for virtual PE: sysfs entries are not ready yet and + * tce_iommu_bus_notifier will add the device to a group later. + */ } static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) @@ -1660,7 +1666,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base_and_group(&dev->dev, pe->tce32_table); + set_iommu_table_base(&dev->dev, pe->tce32_table); + iommu_add_device(&dev->dev); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); @@ -1836,7 +1843,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (pe->flags & PNV_IODA_PE_DEV) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - set_iommu_table_base_and_group(&pe->pdev->dev, tbl); + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + set_iommu_table_base(&pe->pdev->dev, tbl); + iommu_add_device(&pe->pdev->dev); } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); @@ -1964,7 +1977,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (pe->flags & PNV_IODA_PE_DEV) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - set_iommu_table_base_and_group(&pe->pdev->dev, tbl); + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + set_iommu_table_base(&pe->pdev->dev, tbl); + iommu_add_device(&pe->pdev->dev); } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 7f826e8cc1d4..61ae20aa9000 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -92,7 +92,8 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, pci_domain_nr(phb->hose->bus), phb->opal_id); } - set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); + iommu_add_device(&pdev->dev); } static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = { diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 61d5a17f45c0..05ab06dccb45 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -688,8 +688,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); - set_iommu_table_base_and_group(&dev->dev, - PCI_DN(dn)->iommu_table); + set_iommu_table_base(&dev->dev, tbl); + iommu_add_device(&dev->dev); return; } @@ -700,10 +700,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) dn = dn->parent; - if (dn && PCI_DN(dn)) - set_iommu_table_base_and_group(&dev->dev, - PCI_DN(dn)->iommu_table); - else + if (dn && PCI_DN(dn)) { + set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + iommu_add_device(&dev->dev); + } else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); } @@ -1115,7 +1115,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug(" found DMA window, table: %p\n", pci->iommu_table); } - set_iommu_table_base_and_group(&dev->dev, pci->iommu_table); + set_iommu_table_base(&dev->dev, pci->iommu_table); + iommu_add_device(&dev->dev); } static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) -- cgit v1.2.3 From c5773822c000bafd462ad02db1d61459acef0d52 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:55 +1000 Subject: powerpc/powernv/ioda: Clean up IOMMU group registration The existing code has 3 calls to iommu_register_group() and all 3 branches actually cover all possible cases. This replaces 3 calls with one and moves the registration earlier; the latter will make more sense when we add TCE table sharing. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fd594b28fce1..2b829eb07f87 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1785,6 +1785,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; + tbl = pe->tce32_table; + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + /* Grab a 32-bit TCE table */ pe->tce32_seg = base; pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", @@ -1819,7 +1822,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = pe->tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base << 28, IOMMU_PAGE_SHIFT_4K); @@ -1841,8 +1843,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); /* * Setting table base here only for carrying iommu_group * further down to let iommu_add_device() do the job. @@ -1850,14 +1850,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, */ set_iommu_table_base(&pe->pdev->dev, tbl); iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); - } else if (pe->flags & PNV_IODA_PE_VF) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - } return; fail: @@ -1924,6 +1918,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; + tbl = pe->tce32_table; + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; end = (1 << ilog2(phb->ioda.m32_pci_base)); @@ -1955,7 +1952,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = pe->tce32_table; pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, IOMMU_PAGE_SHIFT_4K); @@ -1975,8 +1971,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); /* * Setting table base here only for carrying iommu_group * further down to let iommu_add_device() do the job. @@ -1984,14 +1978,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, */ set_iommu_table_base(&pe->pdev->dev, tbl); iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); - } else if (pe->flags & PNV_IODA_PE_VF) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - } /* Also create a bypass window */ if (!pnv_iommu_bypass_disabled) -- cgit v1.2.3 From ac9a58891a965216e9cb549a0a2012b97e3abcce Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:34:56 +1000 Subject: powerpc/iommu: Put IOMMU group explicitly So far an iommu_table lifetime was the same as PE. Dynamic DMA windows will change this and iommu_free_table() will not always require the group to be released. This moves iommu_group_put() out of iommu_free_table(). This adds a iommu_pseries_free_table() helper which does iommu_group_put() and iommu_free_table(). Later it will be changed to receive a table_group and we will have to change less lines then. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/iommu.c | 7 ------- arch/powerpc/platforms/powernv/pci-ioda.c | 5 +++++ arch/powerpc/platforms/pseries/iommu.c | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index b054f33ab1fb..3d47eb3e15dd 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) if (tbl->it_offset == 0) clear_bit(0, tbl->it_map); -#ifdef CONFIG_IOMMU_API - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); - } -#endif - /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2b829eb07f87..8070dc9921f2 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); free_pages(addr, get_order(TCE32_TABLE_SIZE)); pe->tce32_table = NULL; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 05ab06dccb45..fe5117bac29a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,18 @@ #include "pseries.h" +static void iommu_pseries_free_table(struct iommu_table *tbl, + const char *node_name) +{ +#ifdef CONFIG_IOMMU_API + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } +#endif + iommu_free_table(tbl, node_name); +} + static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, __be64 *startp, __be64 *endp) { @@ -1271,7 +1284,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti */ remove_ddw(np, false); if (pci && pci->iommu_table) - iommu_free_table(pci->iommu_table, np->full_name); + iommu_pseries_free_table(pci->iommu_table, + np->full_name); spin_lock(&direct_window_list_lock); list_for_each_entry(window, &direct_window_list, list) { -- cgit v1.2.3 From 10b35b2b7485c342334a48cf199063eed8b8748e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:05 +1000 Subject: powerpc/powernv: Do not set "read" flag if direction==DMA_NONE Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag as the old TCE value (more precisely - the permission bits) will be used to decide whether to put the page or not. This adds iommu_direction_to_tce_perm() (its counterpart is there already) and uses it for powernv's pnv_tce_build(). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/iommu.c | 15 +++++++++++++++ arch/powerpc/platforms/powernv/pci.c | 7 +------ 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index e94a5e3c5a2f..d91bd69d3196 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); extern enum dma_data_direction iommu_tce_direction(unsigned long tce); +extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0019c80aa81d..ac2f959adf9a 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -866,6 +866,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, } } +unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + return TCE_PCI_READ | TCE_PCI_WRITE; + case DMA_FROM_DEVICE: + return TCE_PCI_WRITE; + case DMA_TO_DEVICE: + return TCE_PCI_READ; + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); + #ifdef CONFIG_IOMMU_API /* * SPAPR TCE API diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 3c35487e9778..1477422b907a 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs, bool rm) { - u64 proto_tce; + u64 proto_tce = iommu_direction_to_tce_perm(direction); __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> tbl->it_page_shift; -- cgit v1.2.3 From da004c3600f52e4f05017f60970e5010978006bc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:06 +1000 Subject: powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table This adds a iommu_table_ops struct and puts pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong to. This adds the requirement for @it_ops to be initialized before calling iommu_init_table() to make sure that we do not leave any IOMMU table with iommu_table_ops uninitialized. This is not a parameter of iommu_init_table() though as there will be cases when iommu_init_table() will not be called on TCE tables, for example - VFIO. This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_" redundant prefixes. This removes tce_xxx_rm handlers from ppc_md but does not add them to iommu_table_ops as this will be done later if we decide to support TCE hypercalls in real mode. This removes _vm callbacks as only virtual mode is supported by now so this also removes @rm parameter. For pSeries, this always uses tce_buildmulti_pSeriesLP/ tce_buildmulti_pSeriesLP. This changes multi callback to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support "multitce=off" boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace "multi" callbacks with single ones. For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2. This makes the callbacks for them public. Later patches will extend callbacks for IODA1/2. No change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 17 +++++++++++ arch/powerpc/include/asm/machdep.h | 25 --------------- arch/powerpc/kernel/iommu.c | 46 ++++++++++++++-------------- arch/powerpc/kernel/vio.c | 5 +++ arch/powerpc/platforms/cell/iommu.c | 8 +++-- arch/powerpc/platforms/pasemi/iommu.c | 7 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 14 +++++++++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 7 +++++ arch/powerpc/platforms/powernv/pci.c | 47 +++++------------------------ arch/powerpc/platforms/powernv/pci.h | 5 +++ arch/powerpc/platforms/pseries/iommu.c | 34 ++++++++++++--------- arch/powerpc/sysdev/dart_iommu.c | 12 +++++--- 12 files changed, 116 insertions(+), 111 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d91bd69d3196..e2a45c37dba8 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -44,6 +44,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. IOMAP_MAX_ORDER = 13 @@ -78,6 +94,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); #ifdef CONFIG_PPC_POWERNV void *data; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 0d387d0570cd..952579f5e79a 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void (*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, - long index, - long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void (*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, - long index, - long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*tce_free_rm)(struct iommu_table *tbl, - long index, - long npages); - void (*tce_flush_rm)(struct iommu_table *tbl); - void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, unsigned long flags, void *caller); void (*iounmap)(volatile void __iomem *token); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ac2f959adf9a..c0e67e945c95 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, ret = entry << tbl->it_page_shift; /* Set the return dma address */ /* Put the TCEs in the HW table */ - build_fail = ppc_md.tce_build(tbl, entry, npages, + build_fail = tbl->it_ops->set(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK(tbl), direction, attrs); - /* ppc_md.tce_build() only returns non-zero for transient errors. + /* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return * DMA_ERROR_CODE. For all other errors the functionality is * not altered. @@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, } /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); /* Make sure updates are seen by hardware */ mb(); @@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, if (!iommu_free_check(tbl, dma_addr, npages)) return; - ppc_md.tce_free(tbl, entry, npages); + tbl->it_ops->clear(tbl, entry, npages); spin_lock_irqsave(&(pool->lock), flags); bitmap_clear(tbl->it_map, free_entry, npages); @@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, * not do an mb() here on purpose, it is not needed on any of * the current platforms. */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); } int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, @@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages, entry, dma_addr); /* Insert into HW table */ - build_fail = ppc_md.tce_build(tbl, entry, npages, + build_fail = tbl->it_ops->set(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK(tbl), direction, attrs); if(unlikely(build_fail)) @@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, } /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); DBG("mapped %d elements:\n", outcount); @@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, * do not do an mb() here, the affected platforms do not need it * when freeing. */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); } static void iommu_table_clear(struct iommu_table *tbl) @@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl) */ if (!is_kdump_kernel() || is_fadump_active()) { /* Clear the table in case firmware left allocations in it */ - ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); return; } #ifdef CONFIG_CRASH_DUMP - if (ppc_md.tce_get) { + if (tbl->it_ops->get) { unsigned long index, tceval, tcecount = 0; /* Reserve the existing mappings left by the first kernel. */ for (index = 0; index < tbl->it_size; index++) { - tceval = ppc_md.tce_get(tbl, index + tbl->it_offset); + tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); /* * Freed TCE entry contains 0x7fffffffffffffff on JS20 */ @@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) unsigned int i; struct iommu_pool *p; + BUG_ON(!tbl->it_ops); + /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); @@ -929,8 +931,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction); void iommu_flush_tce(struct iommu_table *tbl) { /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); /* Make sure updates are seen by hardware */ mb(); @@ -941,7 +943,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce_value, unsigned long npages) { - /* ppc_md.tce_free() does not support any value but 0 */ + /* tbl->it_ops->clear() does not support any value but 0 */ if (tce_value) return -EINVAL; @@ -989,9 +991,9 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) spin_lock(&(pool->lock)); - oldtce = ppc_md.tce_get(tbl, entry); + oldtce = tbl->it_ops->get(tbl, entry); if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - ppc_md.tce_free(tbl, entry, 1); + tbl->it_ops->clear(tbl, entry, 1); else oldtce = 0; @@ -1014,10 +1016,10 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, spin_lock(&(pool->lock)); - oldtce = ppc_md.tce_get(tbl, entry); + oldtce = tbl->it_ops->get(tbl, entry); /* Add new entry if it is not busy */ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL); spin_unlock(&(pool->lock)); diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5bfdab9047be..b41426c60ef6 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) tbl->it_type = TCE_VB; tbl->it_blocksize = 16; + if (firmware_has_feature(FW_FEATURE_LPAR)) + tbl->it_ops = &iommu_table_lpar_multi_ops; + else + tbl->it_ops = &iommu_table_pseries_ops; + return iommu_init_table(tbl, -1); } diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 21b502398bf3..14a582b21274 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np) return *ioid; } +static struct iommu_table_ops cell_iommu_ops = { + .set = tce_build_cell, + .clear = tce_free_cell +}; + static struct iommu_window * __init cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, unsigned long offset, unsigned long size, @@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_offset = (offset >> window->table.it_page_shift) + pte_offset; window->table.it_size = size >> window->table.it_page_shift; + window->table.it_ops = &cell_iommu_ops; iommu_init_table(&window->table, iommu->nid); @@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void) /* Setup various callbacks */ cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; ppc_md.dma_get_required_mask = cell_dma_get_required_mask; - ppc_md.tce_build = tce_build_cell; - ppc_md.tce_free = tce_free_cell; if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) goto bail; diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index b8f567b2ea19..c929644e74a6 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index, } } +static struct iommu_table_ops iommu_table_iobmap_ops = { + .set = iobmap_build, + .clear = iobmap_free +}; static void iommu_table_iobmap_setup(void) { @@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void) * Should probably be 8 (64 bytes) */ iommu_table_iobmap.it_blocksize = 4; + iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; iommu_init_table(&iommu_table_iobmap, 0); pr_debug(" <- %s\n", __func__); } @@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void) pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi; pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi; - ppc_md.tce_build = iobmap_build; - ppc_md.tce_free = iobmap_free; set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8070dc9921f2..b0a14c0e7774 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1726,6 +1726,12 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, */ } +static struct iommu_table_ops pnv_ioda1_iommu_ops = { + .set = pnv_tce_build, + .clear = pnv_tce_free, + .get = pnv_tce_get, +}; + static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm) @@ -1770,6 +1776,12 @@ void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); } +static struct iommu_table_ops pnv_ioda2_iommu_ops = { + .set = pnv_tce_build, + .clear = pnv_tce_free, + .get = pnv_tce_get, +}; + static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) @@ -1845,6 +1857,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_FREE | TCE_PCI_SWINV_PAIR); } + tbl->it_ops = &pnv_ioda1_iommu_ops; iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { @@ -1973,6 +1986,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 8); tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } + tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 61ae20aa9000..5a387a9132ff 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -83,10 +83,17 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } #endif /* CONFIG_PCI_MSI */ +static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { + .set = pnv_tce_build, + .clear = pnv_tce_free, + .get = pnv_tce_get, +}; + static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { if (phb->p5ioc2.iommu_table.it_map == NULL) { + phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops; iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); iommu_register_group(&phb->p5ioc2.iommu_table, pci_domain_nr(phb->hose->bus), phb->opal_id); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 1477422b907a..2793b9d576e3 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -572,9 +572,9 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; -static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs, bool rm) +int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs) { u64 proto_tce = iommu_direction_to_tce_perm(direction); __be64 *tcep, *tces; @@ -592,22 +592,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, * of flags if that becomes the case */ if (tbl->it_type & TCE_PCI_SWINV_CREATE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false); return 0; } -static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, - false); -} - -static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, - bool rm) +void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { __be64 *tcep, *tces; @@ -617,32 +607,14 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, *(tcep++) = cpu_to_be64(0); if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); -} - -static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages) -{ - pnv_tce_free(tbl, index, npages, false); + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false); } -static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { return ((u64 *)tbl->it_base)[index - tbl->it_offset]; } -static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true); -} - -static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) -{ - pnv_tce_free(tbl, index, npages, true); -} - void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift) @@ -744,11 +716,6 @@ void __init pnv_pci_init(void) pci_devs_phb_init(); /* Configure IOMMU DMA hooks */ - ppc_md.tce_build = pnv_tce_build_vm; - ppc_md.tce_free = pnv_tce_free_vm; - ppc_md.tce_build_rm = pnv_tce_build_rm; - ppc_md.tce_free_rm = pnv_tce_free_rm; - ppc_md.tce_get = pnv_tce_get; set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 714ee3c19854..45a756007ec3 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -197,6 +197,11 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index fe5117bac29a..33f3a855bfd9 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -206,7 +206,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; unsigned long flags; - if (npages == 1) { + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } @@ -298,6 +298,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n { u64 rc; + if (!firmware_has_feature(FW_FEATURE_MULTITCE)) + return tce_free_pSeriesLP(tbl, tcenum, npages); + rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); if (rc && printk_ratelimit()) { @@ -473,7 +476,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } - #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, @@ -559,6 +561,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, tbl->it_size = size >> tbl->it_page_shift; } +struct iommu_table_ops iommu_table_pseries_ops = { + .set = tce_build_pSeries, + .clear = tce_free_pSeries, + .get = tce_get_pseries +}; + static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) { struct device_node *dn; @@ -627,6 +635,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pci->phb->node); iommu_table_setparms(pci->phb, dn, tbl); + tbl->it_ops = &iommu_table_pseries_ops; pci->iommu_table = iommu_init_table(tbl, pci->phb->node); iommu_register_group(tbl, pci_domain_nr(bus), 0); @@ -638,6 +647,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); } +struct iommu_table_ops iommu_table_lpar_multi_ops = { + .set = tce_buildmulti_pSeriesLP, + .clear = tce_freemulti_pSeriesLP, + .get = tce_get_pSeriesLP +}; static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) { @@ -672,6 +686,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); + tbl->it_ops = &iommu_table_lpar_multi_ops; ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->iommu_table); @@ -699,6 +714,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, phb->node); iommu_table_setparms(phb, dn, tbl); + tbl->it_ops = &iommu_table_pseries_ops; PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); set_iommu_table_base(&dev->dev, tbl); @@ -1121,6 +1137,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); + tbl->it_ops = &iommu_table_lpar_multi_ops; pci->iommu_table = iommu_init_table(tbl, pci->phb->node); iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->iommu_table); @@ -1315,22 +1332,11 @@ void iommu_init_early_pSeries(void) return; if (firmware_has_feature(FW_FEATURE_LPAR)) { - if (firmware_has_feature(FW_FEATURE_MULTITCE)) { - ppc_md.tce_build = tce_buildmulti_pSeriesLP; - ppc_md.tce_free = tce_freemulti_pSeriesLP; - } else { - ppc_md.tce_build = tce_build_pSeriesLP; - ppc_md.tce_free = tce_free_pSeriesLP; - } - ppc_md.tce_get = tce_get_pSeriesLP; pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; } else { - ppc_md.tce_build = tce_build_pSeries; - ppc_md.tce_free = tce_free_pSeries; - ppc_md.tce_get = tce_get_pseries; pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; } @@ -1348,8 +1354,6 @@ static int __init disable_multitce(char *str) firmware_has_feature(FW_FEATURE_LPAR) && firmware_has_feature(FW_FEATURE_MULTITCE)) { printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); - ppc_md.tce_build = tce_build_pSeriesLP; - ppc_md.tce_free = tce_free_pSeriesLP; powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; } return 1; diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index d00a5663e312..90bcdfeedf48 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node) return 0; } +static struct iommu_table_ops iommu_dart_ops = { + .set = dart_build, + .clear = dart_free, + .flush = dart_flush, +}; + static void iommu_table_dart_setup(void) { iommu_table_dart.it_busno = 0; @@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void) iommu_table_dart.it_base = (unsigned long)dart_vbase; iommu_table_dart.it_index = 0; iommu_table_dart.it_blocksize = 1; + iommu_table_dart.it_ops = &iommu_dart_ops; iommu_init_table(&iommu_table_dart, -1); /* Reserve the last page of the DART to avoid possible prefetch @@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) if (dart_init(dn) != 0) goto bail; - /* Setup low level TCE operations for the core IOMMU code */ - ppc_md.tce_build = dart_build; - ppc_md.tce_free = dart_free; - ppc_md.tce_flush = dart_flush; - /* Setup bypass if supported */ if (dart_is_u4) ppc_md.dma_set_mask = dart_dma_set_mask; -- cgit v1.2.3 From decbda25728ddfbb28b77749d2545028e892ca99 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:07 +1000 Subject: powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free() The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is supposed to be called on IODA1/2 and not called on p5ioc2. It receives start and end host addresses of TCE table. IODA2 actually needs PCI addresses to invalidate the cache. Those can be calculated from host addresses but since we are going to implement multi-level TCE tables, calculating PCI address from a host address might get either tricky or ugly as TCE table remains flat on PCI bus but not in RAM. This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/ pnt_tce_free and defines IODA1/2-specific callbacks which call generic ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps using generic callbacks as before. This changes pnv_pci_ioda2_tce_invalidate() to receives TCE index and number of pages which are PCI addresses shifted by IOMMU page shift. No change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++++++++++++++++++++++--------- arch/powerpc/platforms/powernv/pci.c | 17 ++----- 2 files changed, 61 insertions(+), 37 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b0a14c0e7774..f710729a4a35 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1679,18 +1679,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, } } -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, - struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) { + struct pnv_ioda_pe *pe = tbl->data; __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : (__be64 __iomem *)tbl->it_index; unsigned long start, end, inc; const unsigned shift = tbl->it_page_shift; - start = __pa(startp); - end = __pa(endp); + start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); + end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + + npages - 1); /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ if (tbl->it_busno) { @@ -1726,16 +1727,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, */ } +static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, + attrs); + + if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); + + return ret; +} + +static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, + long npages) +{ + pnv_tce_free(tbl, index, npages); + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); +} + static struct iommu_table_ops pnv_ioda1_iommu_ops = { - .set = pnv_tce_build, - .clear = pnv_tce_free, + .set = pnv_ioda1_tce_build, + .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, }; -static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, - struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) { + struct pnv_ioda_pe *pe = tbl->data; unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : @@ -1748,10 +1772,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, end = start; /* Figure out the start, end and step */ - inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); - start |= (inc << shift); - inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); - end |= (inc << shift); + start |= (index << shift); + end |= ((index + npages - 1) << shift); inc = (0x1ull << shift); mb(); @@ -1764,21 +1786,32 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) { - struct pnv_ioda_pe *pe = tbl->data; - struct pnv_phb *phb = pe->phb; + int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, + attrs); - if (phb->type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); + if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) + pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + + return ret; +} + +static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, + long npages) +{ + pnv_tce_free(tbl, index, npages); + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { - .set = pnv_tce_build, - .clear = pnv_tce_free, + .set = pnv_ioda2_tce_build, + .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, }; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 2793b9d576e3..f72fc6e7d63d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -577,37 +577,28 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, struct dma_attrs *attrs) { u64 proto_tce = iommu_direction_to_tce_perm(direction); - __be64 *tcep, *tces; + __be64 *tcep; u64 rpn; - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> tbl->it_page_shift; while (npages--) *(tcep++) = cpu_to_be64(proto_tce | (rpn++ << tbl->it_page_shift)); - /* Some implementations won't cache invalid TCEs and thus may not - * need that flush. We'll probably turn it_type into a bit mask - * of flags if that becomes the case - */ - if (tbl->it_type & TCE_PCI_SWINV_CREATE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false); return 0; } void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { - __be64 *tcep, *tces; + __be64 *tcep; - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; while (npages--) *(tcep++) = cpu_to_be64(0); - - if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false); } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) -- cgit v1.2.3 From b348aa65297659c310943221ac1d3f4b4491ea44 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:08 +1000 Subject: powerpc/spapr: vfio: Replace iommu_table with iommu_table_group Modern IBM POWERPC systems support multiple (currently two) TCE tables per IOMMU group (a.k.a. PE). This adds a iommu_table_group container for TCE tables. Right now just one table is supported. This defines iommu_table_group struct which stores pointers to iommu_group and iommu_table(s). This replaces iommu_table with iommu_table_group where iommu_table was used to identify a group: - iommu_register_group(); - iommudata of generic iommu_group; This removes @data from iommu_table as it_table_group provides same access to pnv_ioda_pe. For IODA, instead of embedding iommu_table, the new iommu_table_group keeps pointers to those. The iommu_table structs are allocated dynamically. For P5IOC2, both iommu_table_group and iommu_table are embedded into PE struct. As there is no EEH and SRIOV support for P5IOC2, iommu_free_table() should not be called on iommu_table struct pointers so we can keep it embedded in pnv_phb::p5ioc2. For pSeries, this replaces multiple calls of kzalloc_node() with a new iommu_pseries_alloc_group() helper and stores the table group struct pointer into the pci_dn struct. For release, a iommu_table_free_group() helper is added. This moves iommu_table struct allocation from SR-IOV code to the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized. This change is here because those lines had to be changed anyway. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 19 ++--- arch/powerpc/include/asm/pci-bridge.h | 2 +- arch/powerpc/kernel/iommu.c | 17 ++--- arch/powerpc/platforms/powernv/pci-ioda.c | 55 +++++++------- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 18 +++-- arch/powerpc/platforms/powernv/pci.h | 3 +- arch/powerpc/platforms/pseries/iommu.c | 107 +++++++++++++++++++--------- drivers/vfio/vfio_iommu_spapr_tce.c | 23 +++--- 8 files changed, 152 insertions(+), 92 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index e2a45c37dba8..5a7267f47bdb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -91,14 +91,9 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ -#ifdef CONFIG_IOMMU_API - struct iommu_group *it_group; -#endif + struct iommu_table_group *it_table_group; struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); -#ifdef CONFIG_PPC_POWERNV - void *data; -#endif }; /* Pure 2^n version of get_order */ @@ -129,14 +124,22 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); +#define IOMMU_TABLE_GROUP_MAX_TABLES 1 + +struct iommu_table_group { + struct iommu_group *group; + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; +}; + #ifdef CONFIG_IOMMU_API -extern void iommu_register_group(struct iommu_table *tbl, + +extern void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); #else -static inline void iommu_register_group(struct iommu_table *tbl, +static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index b76cd5682a8c..712add590445 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -199,7 +199,7 @@ struct pci_dn { struct pci_dn *parent; struct pci_controller *phb; /* for pci devices */ - struct iommu_table *iommu_table; /* for phb's or bridges */ + struct iommu_table_group *table_group; /* for phb's or bridges */ struct device_node *node; /* back-pointer to the device_node */ int pci_ext_config_space; /* for pci devices */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c0e67e945c95..719f0485ec79 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -889,11 +889,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + struct iommu_table_group *table_group = iommu_data; + + table_group->group = NULL; } -void iommu_register_group(struct iommu_table *tbl, +void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { struct iommu_group *grp; @@ -905,8 +906,8 @@ void iommu_register_group(struct iommu_table *tbl, PTR_ERR(grp)); return; } - tbl->it_group = grp; - iommu_group_set_iommudata(grp, tbl, group_release); + table_group->group = grp; + iommu_group_set_iommudata(grp, table_group, group_release); name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", pci_domain_number, pe_num); if (!name) @@ -1094,7 +1095,7 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl->it_group) { + if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) { pr_debug("%s: Skipping device %s with no tbl\n", __func__, dev_name(dev)); return 0; @@ -1102,7 +1103,7 @@ int iommu_add_device(struct device *dev) pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", @@ -1111,7 +1112,7 @@ int iommu_add_device(struct device *dev) return -EINVAL; } - return iommu_group_add_device(tbl->it_group, dev); + return iommu_group_add_device(tbl->it_table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f710729a4a35..279dadf43d5a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) return; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Associate it with all child devices */ pnv_ioda_setup_same_PE(bus, pe); @@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe struct iommu_table *tbl; unsigned long addr; int64_t rc; + struct iommu_table_group *table_group; bus = dev->bus; hose = pci_bus_to_host(bus); phb = hose->private_data; - tbl = pe->tce32_table; + tbl = pe->table_group.tables[0]; addr = tbl->it_base; opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, @@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); + table_group = tbl->it_table_group; + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); } iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); free_pages(addr, get_order(TCE32_TABLE_SIZE)); - pe->tce32_table = NULL; + pe->table_group.tables[0] = NULL; } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) continue; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Put PE to the list */ mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); @@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); /* * Note: iommu_add_device() will fail here as * for physical PE: the device is already added by now; @@ -1637,7 +1631,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) } else { dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(&pdev->dev, &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -1671,7 +1665,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->tce32_table); + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); iommu_add_device(&dev->dev); if (dev->subordinate) @@ -1682,7 +1676,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : (__be64 __iomem *)tbl->it_index; @@ -1759,7 +1754,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : @@ -1835,8 +1831,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = pe->tce32_table; - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + phb->hose->node); + tbl->it_table_group = &pe->table_group; + pe->table_group.tables[0] = tbl; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); /* Grab a 32-bit TCE table */ pe->tce32_seg = base; @@ -1915,7 +1915,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { - struct pnv_ioda_pe *pe = tbl->data; + struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1949,10 +1950,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe->tce_bypass_base = 1ull << 59; /* Install set_bypass callback for VFIO */ - pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; + pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe->tce32_table, true); + pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true); } static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, @@ -1969,8 +1970,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = pe->tce32_table; - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + phb->hose->node); + tbl->it_table_group = &pe->table_group; + pe->table_group.tables[0] = tbl; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 5a387a9132ff..6e00104093d6 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - if (phb->p5ioc2.iommu_table.it_map == NULL) { - phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops; - iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); - iommu_register_group(&phb->p5ioc2.iommu_table, + struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0]; + + if (!tbl->it_map) { + tbl->it_ops = &pnv_p5ioc2_iommu_ops; + iommu_init_table(tbl, phb->hose->node); + iommu_register_group(&phb->p5ioc2.table_group, pci_domain_nr(phb->hose->bus), phb->opal_id); } - set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base(&pdev->dev, tbl); iommu_add_device(&pdev->dev); } @@ -188,6 +190,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, tce_mem, tce_size, 0, IOMMU_PAGE_SHIFT_4K); + /* + * We do not allocate iommu_table as we do not support + * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table() + * should not be called for phb->p5ioc2.table_group.tables[0] ever. + */ + phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 45a756007ec3..d8ba34d7af1b 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -57,7 +57,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ int tce32_seg; int tce32_segcount; - struct iommu_table *tce32_table; + struct iommu_table_group table_group; phys_addr_t tce_inval_reg_phys; /* 64-bit TCE bypass region */ @@ -120,6 +120,7 @@ struct pnv_phb { union { struct { struct iommu_table iommu_table; + struct iommu_table_group table_group; } p5ioc2; struct { diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 33f3a855bfd9..307d704ffaa3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -52,16 +52,51 @@ #include "pseries.h" -static void iommu_pseries_free_table(struct iommu_table *tbl, +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group = NULL; + struct iommu_table *tbl = NULL; + + table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, + node); + if (!table_group) + goto fail_exit; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + if (!tbl) + goto fail_exit; + + tbl->it_table_group = table_group; + table_group->tables[0] = tbl; + + return table_group; + +fail_exit: + kfree(table_group); + kfree(tbl); + + return NULL; +} + +static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { + struct iommu_table *tbl; + + if (!table_group) + return; + #ifdef CONFIG_IOMMU_API - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); } #endif + + tbl = table_group->tables[0]; iommu_free_table(tbl, node_name); + + kfree(table_group); } static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, @@ -631,13 +666,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pci->phb->dma_window_size = 0x8000000ul; pci->phb->dma_window_base_cur = 0x8000000ul; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -680,16 +715,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci = PCI_DN(pdn); pr_debug(" parent is %s, iommu_table: 0x%p\n", - pdn->full_name, ppci->iommu_table); + pdn->full_name, ppci->table_group); - if (!ppci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - ppci->phb->node); + if (!ppci->table_group) { + ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); + tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); - pr_debug(" created table: %p\n", ppci->iommu_table); + iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(ppci->table_group, + pci_domain_nr(bus), 0); + pr_debug(" created table: %p\n", ppci->table_group); } } @@ -711,12 +747,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) struct pci_controller *phb = PCI_DN(dn)->phb; pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->node); + PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); + tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); - iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); + iommu_init_table(tbl, phb->node); + iommu_register_group(PCI_DN(dn)->table_group, + pci_domain_nr(phb->bus), 0); set_iommu_table_base(&dev->dev, tbl); iommu_add_device(&dev->dev); return; @@ -726,11 +763,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) * an already allocated iommu table is found and use that. */ - while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) + while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) dn = dn->parent; if (dn && PCI_DN(dn)) { - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + set_iommu_table_base(&dev->dev, + PCI_DN(dn)->table_group->tables[0]); iommu_add_device(&dev->dev); } else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", @@ -1117,7 +1155,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %s\n", dn->full_name); - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1133,19 +1171,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug(" parent is %s\n", pdn->full_name); pci = PCI_DN(pdn); - if (!pci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + if (!pci->table_group) { + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); - pr_debug(" created table: %p\n", pci->iommu_table); + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, + pci_domain_nr(pci->phb->bus), 0); + pr_debug(" created table: %p\n", pci->table_group); } else { - pr_debug(" found DMA window, table: %p\n", pci->iommu_table); + pr_debug(" found DMA window, table: %p\n", pci->table_group); } - set_iommu_table_base(&dev->dev, pci->iommu_table); + set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); iommu_add_device(&dev->dev); } @@ -1176,7 +1215,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) * search upwards in the tree until we either hit a dma-window * property, OR find a parent with a table already allocated. */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1220,7 +1259,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) dn = pci_device_to_OF_node(pdev); /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; dn = dn->parent) if (of_get_property(dn, "ibm,dma-window", NULL)) break; @@ -1300,8 +1339,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * the device node. */ remove_ddw(np, false); - if (pci && pci->iommu_table) - iommu_pseries_free_table(pci->iommu_table, + if (pci && pci->table_group) + iommu_pseries_free_group(pci->table_group, np->full_name); spin_lock(&direct_window_list_lock); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index e65bc73cc8a8..c4bc345d64d7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; - WARN_ON(container->tbl && !container->tbl->it_group); + WARN_ON(container->tbl && !container->tbl->it_table_group->group); - if (container->tbl && container->tbl->it_group) - tce_iommu_detach_group(iommu_data, container->tbl->it_group); + if (container->tbl && container->tbl->it_table_group->group) + tce_iommu_detach_group(iommu_data, + container->tbl->it_table_group->group); tce_iommu_disable(container); mutex_destroy(&container->lock); @@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data, if (!tbl) return -ENXIO; - BUG_ON(!tbl->it_group); + BUG_ON(!tbl->it_table_group->group); minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); @@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_unlock(&container->lock); return 0; case VFIO_EEH_PE_OP: - if (!container->tbl || !container->tbl->it_group) + if (!container->tbl || !container->tbl->it_table_group->group) return -ENODEV; - return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group, - cmd, arg); + return vfio_spapr_iommu_eeh_ioctl( + container->tbl->it_table_group->group, + cmd, arg); } return -ENOTTY; @@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data, iommu_group_id(iommu_group), iommu_group); */ if (container->tbl) { pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", - iommu_group_id(container->tbl->it_group), + iommu_group_id(container->tbl-> + it_table_group->group), iommu_group_id(iommu_group)); ret = -EBUSY; goto unlock_exit; @@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data, if (tbl != container->tbl) { pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", iommu_group_id(iommu_group), - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); goto unlock_exit; } if (container->enabled) { pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(tbl->it_group)); + iommu_group_id(tbl->it_table_group->group)); tce_iommu_disable(container); } -- cgit v1.2.3 From 0eaf4defc7c44ed5dd33a03cab12a5f88c9b4b86 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:09 +1000 Subject: powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group So far one TCE table could only be used by one IOMMU group. However IODA2 hardware allows programming the same TCE table address to multiple PE allowing sharing tables. This replaces a single pointer to a group in a iommu_table struct with a linked list of groups which provides the way of invalidating TCE cache for every PE when an actual TCE table is updated. This adds pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers to manage the list. However without VFIO, it is still going to be a single IOMMU group per iommu_table. This changes iommu_add_device() to add a device to a first group from the group list of a table as it is only called from the platform init code or PCI bus notifier and at these moments there is only one group per table. This does not change TCE invalidation code to loop through all attached groups in order to simplify this patch and because it is not really needed in most cases. IODA2 is fixed in a later patch. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 8 +- arch/powerpc/kernel/iommu.c | 14 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 45 ++++++---- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 + arch/powerpc/platforms/powernv/pci.c | 75 +++++++++++++++++ arch/powerpc/platforms/powernv/pci.h | 7 ++ arch/powerpc/platforms/pseries/iommu.c | 27 +++++- drivers/vfio/vfio_iommu_spapr_tce.c | 122 ++++++++++++++++++++-------- 8 files changed, 241 insertions(+), 60 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 5a7267f47bdb..44a20ccf06b4 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -91,7 +91,7 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ - struct iommu_table_group *it_table_group; + struct list_head it_group_list;/* List of iommu_table_group_link */ struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; @@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); #define IOMMU_TABLE_GROUP_MAX_TABLES 1 +struct iommu_table_group_link { + struct list_head next; + struct rcu_head rcu; + struct iommu_table_group *table_group; +}; + struct iommu_table_group { struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 719f0485ec79..be258b2ecb10 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership); int iommu_add_device(struct device *dev) { struct iommu_table *tbl; + struct iommu_table_group_link *tgl; /* * The sysfs entries should be populated before @@ -1095,15 +1096,22 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) { + if (!tbl) { pr_debug("%s: Skipping device %s with no tbl\n", __func__, dev_name(dev)); return 0; } + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + if (!tgl) { + pr_debug("%s: Skipping device %s with no group\n", + __func__, dev_name(dev)); + return 0; + } pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(tgl->table_group->group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", @@ -1112,7 +1120,7 @@ int iommu_add_device(struct device *dev) return -EINVAL; } - return iommu_group_add_device(tbl->it_table_group->group, dev); + return iommu_group_add_device(tgl->table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 279dadf43d5a..3b4130697b05 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1288,7 +1288,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe struct iommu_table *tbl; unsigned long addr; int64_t rc; - struct iommu_table_group *table_group; bus = dev->bus; hose = pci_bus_to_host(bus); @@ -1308,14 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - table_group = tbl->it_table_group; - if (table_group->group) { - iommu_group_put(table_group->group); - BUG_ON(table_group->group); + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + if (pe->table_group.group) { + iommu_group_put(pe->table_group.group); + BUG_ON(pe->table_group.group); } iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); free_pages(addr, get_order(TCE32_TABLE_SIZE)); - pe->table_group.tables[0] = NULL; } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -1676,7 +1674,10 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->tce_inval_reg_phys : @@ -1754,7 +1755,10 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? @@ -1831,12 +1835,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->hose->node); - tbl->it_table_group = &pe->table_group; - pe->table_group.tables[0] = tbl; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ pe->tce32_seg = base; @@ -1911,11 +1913,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + if (tbl) { + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + iommu_free_table(tbl, "pnv"); + } } static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl->it_table_group, + struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1970,12 +1979,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->hose->node); - tbl->it_table_group = &pe->table_group; - pe->table_group.tables[0] = tbl; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; @@ -2048,6 +2055,10 @@ fail: pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(tce_table_size)); + if (tbl) { + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + iommu_free_table(tbl, "pnv"); + } } static void pnv_ioda_setup_dma(struct pnv_phb *phb) diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 6e00104093d6..f80e5a1d6117 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -99,6 +99,9 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); iommu_register_group(&phb->p5ioc2.table_group, pci_domain_nr(phb->hose->bus), phb->opal_id); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + pnv_pci_link_table_and_group(phb->hose->node, 0, + tbl, &phb->p5ioc2.table_group); } set_iommu_table_base(&pdev->dev, tbl); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index f72fc6e7d63d..00f1abd34379 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -606,6 +606,81 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index) return ((u64 *)tbl->it_base)[index - tbl->it_offset]; } +struct iommu_table *pnv_pci_table_alloc(int nid) +{ + struct iommu_table *tbl; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + + return tbl; +} + +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + if (WARN_ON(!tbl || !table_group)) + return -EINVAL; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[num] = tbl; + + return 0; +} + +static void pnv_iommu_table_group_link_free(struct rcu_head *head) +{ + struct iommu_table_group_link *tgl = container_of(head, + struct iommu_table_group_link, rcu); + + kfree(tgl); +} + +void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + long i; + bool found; + struct iommu_table_group_link *tgl; + + if (!tbl || !table_group) + return; + + /* Remove link to a group from table's list of attached groups */ + found = false; + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + if (tgl->table_group == table_group) { + list_del_rcu(&tgl->next); + call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + found = true; + break; + } + } + if (WARN_ON(!found)) + return; + + /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ + found = false; + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] == tbl) { + table_group->tables[i] = NULL; + found = true; + break; + } + } + WARN_ON(!found); +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d8ba34d7af1b..d37cb4da409f 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -210,6 +210,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, int where, int size, u32 *val); int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); +extern struct iommu_table *pnv_pci_table_alloc(int nid); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 307d704ffaa3..10510dea16b3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group = NULL; struct iommu_table *tbl = NULL; + struct iommu_table_group_link *tgl = NULL; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); @@ -66,12 +68,21 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) if (!tbl) goto fail_exit; - tbl->it_table_group = table_group; + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + goto fail_exit; + + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + table_group->tables[0] = tbl; return table_group; fail_exit: + kfree(tgl); kfree(table_group); kfree(tbl); @@ -82,18 +93,28 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, const char *node_name) { struct iommu_table *tbl; +#ifdef CONFIG_IOMMU_API + struct iommu_table_group_link *tgl; +#endif if (!table_group) return; + tbl = table_group->tables[0]; #ifdef CONFIG_IOMMU_API + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + + WARN_ON_ONCE(!tgl); + if (tgl) { + list_del_rcu(&tgl->next); + kfree(tgl); + } if (table_group->group) { iommu_group_put(table_group->group); BUG_ON(table_group->group); } #endif - - tbl = table_group->tables[0]; iommu_free_table(tbl, node_name); kfree(table_group); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c4bc345d64d7..ffc634a75dba 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -88,7 +88,7 @@ static void decrement_locked_vm(long npages) */ struct tce_container { struct mutex lock; - struct iommu_table *tbl; + struct iommu_group *grp; bool enabled; unsigned long locked_pages; }; @@ -103,13 +103,42 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift) return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; } +static long tce_iommu_find_table(struct tce_container *container, + phys_addr_t ioba, struct iommu_table **ptbl) +{ + long i; + struct iommu_table_group *table_group; + + table_group = iommu_group_get_iommudata(container->grp); + if (!table_group) + return -1; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (tbl) { + unsigned long entry = ioba >> tbl->it_page_shift; + unsigned long start = tbl->it_offset; + unsigned long end = start + tbl->it_size; + + if ((start <= entry) && (entry < end)) { + *ptbl = tbl; + return i; + } + } + } + + return -1; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; unsigned long locked; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl; + struct iommu_table_group *table_group; - if (!container->tbl) + if (!container->grp) return -ENXIO; if (!current->mm) @@ -143,6 +172,11 @@ static int tce_iommu_enable(struct tce_container *container) * as this information is only available from KVM and VFIO is * KVM agnostic. */ + table_group = iommu_group_get_iommudata(container->grp); + if (!table_group) + return -ENODEV; + + tbl = table_group->tables[0]; locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; ret = try_increment_locked_vm(locked); if (ret) @@ -190,11 +224,10 @@ static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; - WARN_ON(container->tbl && !container->tbl->it_table_group->group); + WARN_ON(container->grp); - if (container->tbl && container->tbl->it_table_group->group) - tce_iommu_detach_group(iommu_data, - container->tbl->it_table_group->group); + if (container->grp) + tce_iommu_detach_group(iommu_data, container->grp); tce_iommu_disable(container); mutex_destroy(&container->lock); @@ -312,9 +345,16 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl; + struct iommu_table_group *table_group; + + if (WARN_ON(!container->grp)) + return -ENXIO; + + table_group = iommu_group_get_iommudata(container->grp); - if (WARN_ON(!tbl)) + tbl = table_group->tables[0]; + if (WARN_ON_ONCE(!tbl)) return -ENXIO; minsz = offsetofend(struct vfio_iommu_spapr_tce_info, @@ -337,17 +377,13 @@ static long tce_iommu_ioctl(void *iommu_data, } case VFIO_IOMMU_MAP_DMA: { struct vfio_iommu_type1_dma_map param; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl = NULL; unsigned long tce; + long num; if (!container->enabled) return -EPERM; - if (!tbl) - return -ENXIO; - - BUG_ON(!tbl->it_table_group->group); - minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); if (copy_from_user(¶m, (void __user *)arg, minsz)) @@ -360,6 +396,10 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; + num = tce_iommu_find_table(container, param.iova, &tbl); + if (num < 0) + return -ENXIO; + if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) return -EINVAL; @@ -385,14 +425,12 @@ static long tce_iommu_ioctl(void *iommu_data, } case VFIO_IOMMU_UNMAP_DMA: { struct vfio_iommu_type1_dma_unmap param; - struct iommu_table *tbl = container->tbl; + struct iommu_table *tbl = NULL; + long num; if (!container->enabled) return -EPERM; - if (WARN_ON(!tbl)) - return -ENXIO; - minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -406,6 +444,10 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; + num = tce_iommu_find_table(container, param.iova, &tbl); + if (num < 0) + return -ENXIO; + if (param.size & ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; @@ -434,12 +476,11 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_unlock(&container->lock); return 0; case VFIO_EEH_PE_OP: - if (!container->tbl || !container->tbl->it_table_group->group) + if (!container->grp) return -ENODEV; - return vfio_spapr_iommu_eeh_ioctl( - container->tbl->it_table_group->group, - cmd, arg); + return vfio_spapr_iommu_eeh_ioctl(container->grp, + cmd, arg); } return -ENOTTY; @@ -450,17 +491,15 @@ static int tce_iommu_attach_group(void *iommu_data, { int ret; struct tce_container *container = iommu_data; - struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + struct iommu_table_group *table_group; - BUG_ON(!tbl); mutex_lock(&container->lock); /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ - if (container->tbl) { + if (container->grp) { pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", - iommu_group_id(container->tbl-> - it_table_group->group), + iommu_group_id(container->grp), iommu_group_id(iommu_group)); ret = -EBUSY; goto unlock_exit; @@ -473,9 +512,15 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } - ret = iommu_take_ownership(tbl); + table_group = iommu_group_get_iommudata(iommu_group); + if (!table_group) { + ret = -ENXIO; + goto unlock_exit; + } + + ret = iommu_take_ownership(table_group->tables[0]); if (!ret) - container->tbl = tbl; + container->grp = iommu_group; unlock_exit: mutex_unlock(&container->lock); @@ -487,26 +532,31 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group) { struct tce_container *container = iommu_data; - struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + struct iommu_table_group *table_group; + struct iommu_table *tbl; - BUG_ON(!tbl); mutex_lock(&container->lock); - if (tbl != container->tbl) { + if (iommu_group != container->grp) { pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", iommu_group_id(iommu_group), - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(container->grp)); goto unlock_exit; } if (container->enabled) { pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", - iommu_group_id(tbl->it_table_group->group)); + iommu_group_id(container->grp)); tce_iommu_disable(container); } /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ - container->tbl = NULL; + container->grp = NULL; + + table_group = iommu_group_get_iommudata(iommu_group); + BUG_ON(!table_group); + + tbl = table_group->tables[0]; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); iommu_release_ownership(tbl); -- cgit v1.2.3 From f87a88642e660edd8912ad39fe77848c6f9927a2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:10 +1000 Subject: vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control This adds tce_iommu_take_ownership() and tce_iommu_release_ownership which call in a loop iommu_take_ownership()/iommu_release_ownership() for every table on the group. As there is just one now, no change in behaviour is expected. At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. The set_bypass() callback is not really an iommu_table function but IOMMU/PE function. This introduces a iommu_table_group_ops struct and adds take_ownership()/release_ownership() callbacks to it which are called when an external user takes/releases control over the IOMMU. This replaces set_bypass() with ownership callbacks as it is not necessarily just bypass enabling, it can be something else/more so let's give it more generic name. The callbacks is implemented for IODA2 only. Other platforms (P5IOC2, IODA1) will use the old iommu_take_ownership/iommu_release_ownership API. The following patches will replace iommu_take_ownership/ iommu_release_ownership calls in IODA2 with full IOMMU table release/ create. As we here and touching bypass control, this removes pnv_pci_ioda2_setup_bypass_pe() as it does not do much more compared to pnv_pci_ioda2_set_bypass. This moves tce_bypass_base initialization to pnv_pci_ioda2_setup_dma_pe. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 11 ++++- arch/powerpc/kernel/iommu.c | 12 ------ arch/powerpc/platforms/powernv/pci-ioda.c | 43 ++++++++++++------- drivers/vfio/vfio_iommu_spapr_tce.c | 70 ++++++++++++++++++++++++++++--- 4 files changed, 103 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 44a20ccf06b4..489133cf7c5e 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -93,7 +93,6 @@ struct iommu_table { unsigned long it_page_shift;/* table iommu page size */ struct list_head it_group_list;/* List of iommu_table_group_link */ struct iommu_table_ops *it_ops; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ @@ -126,6 +125,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); #define IOMMU_TABLE_GROUP_MAX_TABLES 1 +struct iommu_table_group; + +struct iommu_table_group_ops { + /* Switch ownership from platform code to external user (e.g. VFIO) */ + void (*take_ownership)(struct iommu_table_group *table_group); + /* Switch ownership from external user (e.g. VFIO) back to core */ + void (*release_ownership)(struct iommu_table_group *table_group); +}; + struct iommu_table_group_link { struct list_head next; struct rcu_head rcu; @@ -135,6 +143,7 @@ struct iommu_table_group_link { struct iommu_table_group { struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; + struct iommu_table_group_ops *ops; }; #ifdef CONFIG_IOMMU_API diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index be258b2ecb10..e7f81b7399ba 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1047,14 +1047,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0xff, sz); - /* - * Disable iommu bypass, otherwise the user can DMA to all of - * our physical memory via the bypass window instead of just - * the pages that has been explicitly mapped into the iommu - */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1068,10 +1060,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3b4130697b05..17a77522ea0d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1919,13 +1919,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct iommu_table_group_link *tgl = list_first_entry_or_null( - &tbl->it_group_list, struct iommu_table_group_link, - next); - struct pnv_ioda_pe *pe = container_of(tgl->table_group, - struct pnv_ioda_pe, table_group); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1952,19 +1947,31 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) pe->tce_bypass_enabled = enable; } -static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +#ifdef CONFIG_IOMMU_API +static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { - /* TVE #1 is selected by PCI address bit 59 */ - pe->tce_bypass_base = 1ull << 59; + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); - /* Install set_bypass callback for VFIO */ - pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass; + iommu_take_ownership(table_group->tables[0]); + pnv_pci_ioda2_set_bypass(pe, false); +} - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true); +static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + + iommu_release_ownership(table_group->tables[0]); + pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .take_ownership = pnv_ioda2_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; +#endif + static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { @@ -1979,6 +1986,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); @@ -2033,6 +2043,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; +#endif if (pe->flags & PNV_IODA_PE_DEV) { /* @@ -2047,7 +2060,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Also create a bypass window */ if (!pnv_iommu_bypass_disabled) - pnv_pci_ioda2_setup_bypass_pe(phb, pe); + pnv_pci_ioda2_set_bypass(pe, true); return; fail: diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index ffc634a75dba..9c720de46c33 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -486,6 +486,61 @@ static long tce_iommu_ioctl(void *iommu_data, return -ENOTTY; } +static void tce_iommu_release_ownership(struct tce_container *container, + struct iommu_table_group *table_group) +{ + int i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl) + continue; + + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + if (tbl->it_map) + iommu_release_ownership(tbl); + } +} + +static int tce_iommu_take_ownership(struct tce_container *container, + struct iommu_table_group *table_group) +{ + int i, j, rc = 0; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl || !tbl->it_map) + continue; + + rc = iommu_take_ownership(tbl); + if (rc) { + for (j = 0; j < i; ++j) + iommu_release_ownership( + table_group->tables[j]); + + return rc; + } + } + + return 0; +} + +static void tce_iommu_release_ownership_ddw(struct tce_container *container, + struct iommu_table_group *table_group) +{ + table_group->ops->release_ownership(table_group); +} + +static long tce_iommu_take_ownership_ddw(struct tce_container *container, + struct iommu_table_group *table_group) +{ + table_group->ops->take_ownership(table_group); + + return 0; +} + static int tce_iommu_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -518,7 +573,12 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } - ret = iommu_take_ownership(table_group->tables[0]); + if (!table_group->ops || !table_group->ops->take_ownership || + !table_group->ops->release_ownership) + ret = tce_iommu_take_ownership(container, table_group); + else + ret = tce_iommu_take_ownership_ddw(container, table_group); + if (!ret) container->grp = iommu_group; @@ -533,7 +593,6 @@ static void tce_iommu_detach_group(void *iommu_data, { struct tce_container *container = iommu_data; struct iommu_table_group *table_group; - struct iommu_table *tbl; mutex_lock(&container->lock); if (iommu_group != container->grp) { @@ -556,9 +615,10 @@ static void tce_iommu_detach_group(void *iommu_data, table_group = iommu_group_get_iommudata(iommu_group); BUG_ON(!table_group); - tbl = table_group->tables[0]; - tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - iommu_release_ownership(tbl); + if (!table_group->ops || !table_group->ops->release_ownership) + tce_iommu_release_ownership(container, table_group); + else + tce_iommu_release_ownership_ddw(container, table_group); unlock_exit: mutex_unlock(&container->lock); -- cgit v1.2.3 From 5780fb04263c16fdb9affd1012d5eb956e0cbcd9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:12 +1000 Subject: powerpc/powernv/ioda2: Move TCE kill register address to PE At the moment the DMA setup code looks for the "ibm,opal-tce-kill" property which contains the TCE kill register address. Writing to this register invalidates TCE cache on IODA/IODA2 hub. This moves the register address from iommu_table to pnv_pnb as this register belongs to PHB and invalidates TCE cache for all tables of all attached PEs. This moves the property reading/remapping code to a helper which is called when DMA is being configured for PE and which does DMA setup for both IODA1 and IODA2. This adds a new pnv_pci_ioda2_tce_invalidate_entire() helper which invalidates cache for the entire table. It should be called after every call to opal_pci_map_pe_dma_window(). It was not required before because there was just a single TCE table and 64bit DMA was handled via bypass window (which has no table so no cache was used) but this is going to change with Dynamic DMA windows (DDW). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 66 ++++++++++++++++++------------- arch/powerpc/platforms/powernv/pci.h | 7 +++- 2 files changed, 44 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 17a77522ea0d..6bbb2bb35581 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1680,8 +1680,8 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->tce_inval_reg_phys : - (__be64 __iomem *)tbl->it_index; + (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : + pe->phb->ioda.tce_inval_reg; unsigned long start, end, inc; const unsigned shift = tbl->it_page_shift; @@ -1752,6 +1752,19 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; +static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) +{ + /* 01xb - invalidate TCEs that match the specified PE# */ + unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); + struct pnv_phb *phb = pe->phb; + + if (!phb->ioda.tce_inval_reg) + return; + + mb(); /* Ensure above stores are visible */ + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); +} + static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { @@ -1762,8 +1775,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct pnv_ioda_pe, table_group); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->tce_inval_reg_phys : - (__be64 __iomem *)tbl->it_index; + (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : + pe->phb->ioda.tce_inval_reg; const unsigned shift = tbl->it_page_shift; /* We'll invalidate DMA address in PE scope */ @@ -1821,7 +1834,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, { struct page *tce_mem = NULL; - const __be64 *swinvp; struct iommu_table *tbl; unsigned int i; int64_t rc; @@ -1878,20 +1890,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, base << 28, IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data - * to or. Since the bus is only printed out on table free - * errors, and on the first pass the data will be a relative - * bus number, print that out instead. - */ - pe->tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, - 8); + if (phb->ioda.tce_inval_reg) tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | TCE_PCI_SWINV_PAIR); - } + tbl->it_ops = &pnv_ioda1_iommu_ops; iommu_init_table(tbl, phb->hose->node); @@ -1972,12 +1975,24 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { }; #endif +static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) +{ + const __be64 *swinvp; + + /* OPAL variant of PHB3 invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (!swinvp) + return; + + phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp); + phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8); +} + static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct page *tce_mem = NULL; void *addr; - const __be64 *swinvp; struct iommu_table *tbl; unsigned int tce_table_size, end; int64_t rc; @@ -2024,23 +2039,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, goto fail; } + pnv_pci_ioda2_tce_invalidate_entire(pe); + /* Setup linux iommu table */ pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, IOMMU_PAGE_SHIFT_4K); /* OPAL variant of PHB3 invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data - * to or. Since the bus is only printed out on table free - * errors, and on the first pass the data will be a relative - * bus number, print that out instead. - */ - pe->tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, - 8); + if (phb->ioda.tce_inval_reg) tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); - } + tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); #ifdef CONFIG_IOMMU_API @@ -2096,6 +2104,8 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) pr_info("PCI: %d PE# for a total weight of %d\n", phb->ioda.dma_pe_count, phb->ioda.dma_weight); + pnv_pci_ioda_setup_opal_tce_kill(phb); + /* Walk our PE list and configure their DMA segments, hand them * out one base segment plus any residual segments based on * weight diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index d37cb4da409f..792a723c36ec 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -58,7 +58,6 @@ struct pnv_ioda_pe { int tce32_seg; int tce32_segcount; struct iommu_table_group table_group; - phys_addr_t tce_inval_reg_phys; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; @@ -184,6 +183,12 @@ struct pnv_phb { * boot for resource allocation purposes */ struct list_head pe_dma_list; + + /* TCE cache invalidate registers (physical and + * remapped) + */ + phys_addr_t tce_inval_reg_phys; + __be64 __iomem *tce_inval_reg; } ioda; }; -- cgit v1.2.3 From e57080f17da5d50f7b3b9d6c045f28632d71309d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:13 +1000 Subject: powerpc/powernv/ioda2: Add TCE invalidation for all attached groups The iommu_table struct keeps a list of IOMMU groups it is used for. At the moment there is just a single group attached but further patches will add TCE table sharing. When sharing is enabled, TCE cache in each PE needs to be invalidated so does the patch. This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan to enable TCE table sharing on PHBs older than IODA2. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 35 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6bbb2bb35581..390863608569 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1765,23 +1766,15 @@ static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); } -static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) +static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, + __be64 __iomem *invalidate, unsigned shift, + unsigned long index, unsigned long npages) { - struct iommu_table_group_link *tgl = list_first_entry_or_null( - &tbl->it_group_list, struct iommu_table_group_link, - next); - struct pnv_ioda_pe *pe = container_of(tgl->table_group, - struct pnv_ioda_pe, table_group); unsigned long start, end, inc; - __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : - pe->phb->ioda.tce_inval_reg; - const unsigned shift = tbl->it_page_shift; /* We'll invalidate DMA address in PE scope */ start = 0x2ull << 60; - start |= (pe->pe_number & 0xFF); + start |= (pe_number & 0xFF); end = start; /* Figure out the start, end and step */ @@ -1799,6 +1792,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, } } +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) +{ + struct iommu_table_group_link *tgl; + + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + struct pnv_ioda_pe *pe = container_of(tgl->table_group, + struct pnv_ioda_pe, table_group); + __be64 __iomem *invalidate = rm ? + (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : + pe->phb->ioda.tce_inval_reg; + + pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, + invalidate, tbl->it_page_shift, + index, npages); + } +} + static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, -- cgit v1.2.3 From c5bb44edee19b2c19221a0b5a68add37ea5733c5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:14 +1000 Subject: powerpc/powernv: Implement accessor to TCE entry This replaces direct accesses to TCE table with a helper which returns an TCE entry address. This does not make difference now but will when multi-level TCE tables get introduces. No change in behavior is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 00f1abd34379..a07b83283ccc 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; +static __be64 *pnv_tce(struct iommu_table *tbl, long idx) +{ + __be64 *tmp = ((__be64 *)tbl->it_base); + + return tmp + idx; +} + int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 proto_tce = iommu_direction_to_tce_perm(direction); - __be64 *tcep; - u64 rpn; - - tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; - rpn = __pa(uaddr) >> tbl->it_page_shift; + u64 rpn = __pa(uaddr) >> tbl->it_page_shift; + long i; - while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | - (rpn++ << tbl->it_page_shift)); + for (i = 0; i < npages; i++) { + unsigned long newtce = proto_tce | + ((rpn + i) << tbl->it_page_shift); + unsigned long idx = index - tbl->it_offset + i; + *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); + } return 0; } void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { - __be64 *tcep; + long i; - tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + for (i = 0; i < npages; i++) { + unsigned long idx = index - tbl->it_offset + i; - while (npages--) - *(tcep++) = cpu_to_be64(0); + *(pnv_tce(tbl, idx)) = cpu_to_be64(0); + } } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return ((u64 *)tbl->it_base)[index - tbl->it_offset]; + return *(pnv_tce(tbl, index - tbl->it_offset)); } struct iommu_table *pnv_pci_table_alloc(int nid) -- cgit v1.2.3 From 05c6cfb9dce0d13d37e9d007ee6a4af36f1c0a58 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:15 +1000 Subject: powerpc/iommu/powernv: Release replaced TCE At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE and DMA direction so the caller can release the pages afterwards. The exchange() receives a physical address unlike set() which receives linear mapping address; and returns a physical address as the clear() does. This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement for a platform to have exchange() implemented in order to support VFIO. This replaces iommu_tce_build() and iommu_clear_tce() with a single iommu_tce_xchg(). This makes sure that TCE permission bits are not set in TCE passed to IOMMU API as those are to be calculated by platform code from DMA direction. This moves SetPageDirty() to the IOMMU code to make it work for both VFIO ioctl interface in in-kernel TCE acceleration (when it becomes available later). Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 22 ++++++++-- arch/powerpc/kernel/iommu.c | 59 +++++++++------------------ arch/powerpc/platforms/powernv/pci-ioda.c | 34 ++++++++++++++++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 ++ arch/powerpc/platforms/powernv/pci.c | 18 +++++++++ arch/powerpc/platforms/powernv/pci.h | 2 + drivers/vfio/vfio_iommu_spapr_tce.c | 63 +++++++++++++++++------------ 7 files changed, 132 insertions(+), 69 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 489133cf7c5e..4636734604d7 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -45,13 +45,29 @@ extern int iommu_is_off; extern int iommu_force_on; struct iommu_table_ops { + /* + * When called with direction==DMA_NONE, it is equal to clear(). + * uaddr is a linear map address. + */ int (*set)(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); +#ifdef CONFIG_IOMMU_API + /* + * Exchanges existing TCE with new TCE plus direction bits; + * returns old TCE and DMA direction mask. + * @tce is a physical address. + */ + int (*exchange)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction); +#endif void (*clear)(struct iommu_table *tbl, long index, long npages); + /* get() returns a physical address */ unsigned long (*get)(struct iommu_table *tbl, long index); void (*flush)(struct iommu_table *tbl); }; @@ -153,6 +169,8 @@ extern void iommu_register_group(struct iommu_table_group *table_group, extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); +extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, @@ -225,10 +243,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long npages); extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); -extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0fb88005c3c5..a8e3490b54e3 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -965,10 +965,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce) { - if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) - return -EINVAL; - - if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) + if (tce & ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; if (ioba & ~IOMMU_PAGE_MASK(tbl)) @@ -985,44 +982,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction) { - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); + long ret; - spin_lock(&(pool->lock)); + ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - oldtce = tbl->it_ops->get(tbl, entry); - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - tbl->it_ops->clear(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock(&(pool->lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - -/* - * hwaddr is a kernel virtual address here (0xc... bazillion), - * tce_build converts it to a physical address. - */ -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) -{ - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); - - oldtce = tbl->it_ops->get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL); - - spin_unlock(&(pool->lock)); + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) + SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); /* if (unlikely(ret)) pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", @@ -1031,13 +1000,23 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, return ret; } -EXPORT_SYMBOL_GPL(iommu_tce_build); +EXPORT_SYMBOL_GPL(iommu_tce_xchg); int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; int ret = 0; + /* + * VFIO does not control TCE entries allocation and the guest + * can write new TCEs on top of existing ones so iommu_tce_build() + * must be able to release old pages. This functionality + * requires exchange() callback defined so if it is not + * implemented, we disallow taking ownership over the table. + */ + if (!tbl->it_ops->exchange) + return -EINVAL; + spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) spin_lock(&tbl->pools[i].lock); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 390863608569..ecbc071a143e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1738,6 +1738,20 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, return ret; } +#ifdef CONFIG_IOMMU_API +static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -1749,6 +1763,9 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda1_tce_xchg, +#endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, }; @@ -1824,6 +1841,20 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, return ret; } +#ifdef CONFIG_IOMMU_API +static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -1835,6 +1866,9 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda2_tce_xchg, +#endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, }; diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index f80e5a1d6117..eaec85b63b34 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -85,6 +85,9 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { .set = pnv_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_tce_xchg, +#endif .clear = pnv_tce_free, .get = pnv_tce_get, }; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index a07b83283ccc..d465b9c32388 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -598,6 +598,24 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, return 0; } +#ifdef CONFIG_IOMMU_API +int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *hpa | proto_tce, oldtce; + unsigned long idx = index - tbl->it_offset; + + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + + oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); + *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); + *direction = iommu_tce_direction(oldtce); + + return 0; +} +#endif + void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { long i; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 792a723c36ec..8ef2d28aded0 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -207,6 +207,8 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 9c720de46c33..a9e2d13c03c0 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data) } static void tce_iommu_unuse_page(struct tce_container *container, - unsigned long oldtce) + unsigned long hpa) { struct page *page; - if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE))) - return; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - + page = pfn_to_page(hpa >> PAGE_SHIFT); put_page(page); } @@ -255,14 +248,21 @@ static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages) { - unsigned long oldtce; + unsigned long oldhpa; + long ret; + enum dma_data_direction direction; for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) + direction = DMA_NONE; + oldhpa = 0; + ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); + if (ret) + continue; + + if (direction == DMA_NONE) continue; - tce_iommu_unuse_page(container, oldtce); + tce_iommu_unuse_page(container, oldhpa); } return 0; @@ -284,12 +284,13 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) static long tce_iommu_build(struct tce_container *container, struct iommu_table *tbl, - unsigned long entry, unsigned long tce, unsigned long pages) + unsigned long entry, unsigned long tce, unsigned long pages, + enum dma_data_direction direction) { long i, ret = 0; struct page *page; unsigned long hpa; - enum dma_data_direction direction = iommu_tce_direction(tce); + enum dma_data_direction dirtmp; for (i = 0; i < pages; ++i) { unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; @@ -305,8 +306,8 @@ static long tce_iommu_build(struct tce_container *container, } hpa |= offset; - ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa), - direction); + dirtmp = direction; + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); if (ret) { tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", @@ -314,6 +315,10 @@ static long tce_iommu_build(struct tce_container *container, tce, ret); break; } + + if (dirtmp != DMA_NONE) + tce_iommu_unuse_page(container, hpa); + tce += IOMMU_PAGE_SIZE(tbl); } @@ -378,8 +383,8 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_MAP_DMA: { struct vfio_iommu_type1_dma_map param; struct iommu_table *tbl = NULL; - unsigned long tce; long num; + enum dma_data_direction direction; if (!container->enabled) return -EPERM; @@ -405,19 +410,27 @@ static long tce_iommu_ioctl(void *iommu_data, return -EINVAL; /* iova is checked by the IOMMU API */ - tce = param.vaddr; - if (param.flags & VFIO_DMA_MAP_FLAG_READ) - tce |= TCE_PCI_READ; - if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) - tce |= TCE_PCI_WRITE; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) { + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + direction = DMA_BIDIRECTIONAL; + else + direction = DMA_TO_DEVICE; + } else { + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + direction = DMA_FROM_DEVICE; + else + return -EINVAL; + } - ret = iommu_tce_put_param_check(tbl, param.iova, tce); + ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); if (ret) return ret; ret = tce_iommu_build(container, tbl, param.iova >> tbl->it_page_shift, - tce, param.size >> tbl->it_page_shift); + param.vaddr, + param.size >> tbl->it_page_shift, + direction); iommu_flush_tce(tbl); -- cgit v1.2.3 From e5aad1e678746af663e3e3acc7f3501309997f51 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:16 +1000 Subject: powerpc/powernv/ioda2: Rework iommu_table creation This moves iommu_table creation to the beginning to make following changes easier to review. This starts using table parameters from the iommu_table struct. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ecbc071a143e..feaba5ecfa97 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2071,13 +2071,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, addr = page_address(tce_mem); memset(addr, 0, tce_table_size); + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); + + tbl->it_ops = &pnv_ioda2_iommu_ops; + iommu_init_table(tbl, phb->hose->node); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; +#endif + /* * Map TCE table through TVT. The TVE index is the PE number * shifted by 1 bit for 32-bits DMA space. */ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(addr), - tce_table_size, 0x1000); + pe->pe_number << 1, 1, __pa(tbl->it_base), + tbl->it_size << 3, 1ULL << tbl->it_page_shift); if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table," " err %ld\n", rc); @@ -2086,20 +2096,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pnv_pci_ioda2_tce_invalidate_entire(pe); - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); - /* OPAL variant of PHB3 invalidated TCEs */ if (phb->ioda.tce_inval_reg) tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); - tbl->it_ops = &pnv_ioda2_iommu_ops; - iommu_init_table(tbl, phb->hose->node); -#ifdef CONFIG_IOMMU_API - pe->table_group.ops = &pnv_pci_ioda2_ops; -#endif - if (pe->flags & PNV_IODA_PE_DEV) { /* * Setting table base here only for carrying iommu_group -- cgit v1.2.3 From aca6913f555176363cda21518ff79da7469ebd64 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:17 +1000 Subject: powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages This is a part of moving TCE table allocation into an iommu_ops callback to support multiple IOMMU groups per one VFIO container. This moves the code which allocates the actual TCE tables to helpers: pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages(). These do not allocate/free the iommu_table struct. This enforces window size to be a power of two. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 83 +++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 20 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index feaba5ecfa97..e61610cccf6e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -49,6 +50,8 @@ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); + static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) { @@ -1313,8 +1316,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); } + pnv_pci_ioda2_table_free_pages(tbl); iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); - free_pages(addr, get_order(TCE32_TABLE_SIZE)); } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -2033,13 +2036,62 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8); } -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift) { struct page *tce_mem = NULL; + __be64 *addr; + unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, order=%d\n", order); + return NULL; + } + addr = page_address(tce_mem); + memset(addr, 0, 1UL << (order + PAGE_SHIFT)); + + return addr; +} + +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, struct iommu_table *tbl) +{ void *addr; + const unsigned window_shift = ilog2(window_size); + unsigned entries_shift = window_shift - page_shift; + unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); + const unsigned long tce_table_size = 1UL << table_shift; + + if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + return -EINVAL; + + /* Allocate TCE table */ + addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift); + if (!addr) + return -ENOMEM; + + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, + page_shift); + + pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", + window_size, tce_table_size, bus_offset); + + return 0; +} + +static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) +{ + if (!tbl->it_size) + return; + + free_pages(tbl->it_base, get_order(tbl->it_size << 3)); +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ struct iommu_table *tbl; - unsigned int tce_table_size, end; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -2056,24 +2108,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; - end = (1 << ilog2(phb->ioda.m32_pci_base)); - tce_table_size = (end / 0x1000) * 8; pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", - end); + phb->ioda.m32_pci_base); - /* Allocate TCE table */ - tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(tce_table_size)); - if (!tce_mem) { - pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); + /* Setup linux iommu table */ + rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node, + 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl); + if (rc) { + pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); goto fail; } - addr = page_address(tce_mem); - memset(addr, 0, tce_table_size); - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); @@ -2119,9 +2163,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, fail: if (pe->tce32_seg >= 0) pe->tce32_seg = -1; - if (tce_mem) - __free_pages(tce_mem, get_order(tce_table_size)); if (tbl) { + pnv_pci_ioda2_table_free_pages(tbl); pnv_pci_unlink_table_and_group(tbl, &pe->table_group); iommu_free_table(tbl, "pnv"); } -- cgit v1.2.3 From 43cb60ab7f8c91b9bf5988edc318dee99ec93b9b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:18 +1000 Subject: powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window This is a part of moving DMA window programming to an iommu_ops callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as a first parameter (not pnv_ioda_pe) as it is going to be used as a callback for VFIO DDW code. This should cause no behavioural change. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Gavin Shan Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e61610cccf6e..552f6ac52560 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1970,6 +1970,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } } +static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + int64_t rc; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; + + pe_info(pe, "Setting up window %llx..%llx pg=%x\n", + start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); + + /* + * Map TCE table through TVT. The TVE index is the PE number + * shifted by 1 bit for 32-bits DMA space. + */ + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + pe->pe_number << 1, + 1, + __pa(tbl->it_base), + tbl->it_size << 3, + IOMMU_PAGE_SIZE(tbl)); + if (rc) { + pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + return rc; + } + + pnv_pci_link_table_and_group(phb->hose->node, num, + tbl, &pe->table_group); + pnv_pci_ioda2_tce_invalidate_entire(pe); + + return 0; +} + static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; @@ -2125,21 +2162,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->table_group.ops = &pnv_pci_ioda2_ops; #endif - /* - * Map TCE table through TVT. The TVE index is the PE number - * shifted by 1 bit for 32-bits DMA space. - */ - rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(tbl->it_base), - tbl->it_size << 3, 1ULL << tbl->it_page_shift); + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table," " err %ld\n", rc); goto fail; } - pnv_pci_ioda2_tce_invalidate_entire(pe); - /* OPAL variant of PHB3 invalidated TCEs */ if (phb->ioda.tce_inval_reg) tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); -- cgit v1.2.3 From bbb845c4bac88d8feffa8945dd28b50849984e30 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:19 +1000 Subject: powerpc/powernv: Implement multilevel TCE tables TCE tables might get too big in case of 4K IOMMU pages and DDW enabled on huge guests (hundreds of GB of RAM) so the kernel might be unable to allocate contiguous chunk of physical memory to store the TCE table. To address this, POWER8 CPU (actually, IODA2) supports multi-level TCE tables, up to 5 levels which splits the table into a tree of smaller subtables. This adds multi-level TCE tables support to pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages() helpers. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 2 + arch/powerpc/platforms/powernv/pci-ioda.c | 105 +++++++++++++++++++++++++++--- arch/powerpc/platforms/powernv/pci.c | 13 ++++ 3 files changed, 111 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4636734604d7..706cfc0b1190 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -96,6 +96,8 @@ struct iommu_pool { struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ + unsigned long it_indirect_levels; + unsigned long it_level_size; unsigned long it_offset; /* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 552f6ac52560..21d8c61f0b03 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -50,6 +50,9 @@ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 + static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, @@ -1977,6 +1980,8 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, table_group); struct pnv_phb *phb = pe->phb; int64_t rc; + const unsigned long size = tbl->it_indirect_levels ? + tbl->it_level_size : tbl->it_size; const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; @@ -1991,9 +1996,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, pe->pe_number << 1, - 1, + tbl->it_indirect_levels + 1, __pa(tbl->it_base), - tbl->it_size << 3, + size << 3, IOMMU_PAGE_SIZE(tbl)); if (rc) { pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); @@ -2073,11 +2078,16 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8); } -static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift) +static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, + unsigned levels, unsigned long limit, + unsigned long *current_offset) { struct page *tce_mem = NULL; - __be64 *addr; + __be64 *addr, *tmp; unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; + unsigned long allocated = 1UL << (order + PAGE_SHIFT); + unsigned entries = 1UL << (shift - 3); + long i; tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); if (!tce_mem) { @@ -2085,31 +2095,79 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift) return NULL; } addr = page_address(tce_mem); - memset(addr, 0, 1UL << (order + PAGE_SHIFT)); + memset(addr, 0, allocated); + + --levels; + if (!levels) { + *current_offset += allocated; + return addr; + } + + for (i = 0; i < entries; ++i) { + tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, + levels, limit, current_offset); + if (!tmp) + break; + + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + + if (*current_offset >= limit) + break; + } return addr; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned level); + static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, struct iommu_table *tbl) + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl) { void *addr; + unsigned long offset = 0, level_shift; const unsigned window_shift = ilog2(window_size); unsigned entries_shift = window_shift - page_shift; unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) + return -EINVAL; + if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) return -EINVAL; + /* Adjust direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + level_shift = entries_shift + 3; + level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); + /* Allocate TCE table */ - addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift); + addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset); + + /* addr==NULL means that the first level allocation failed */ if (!addr) return -ENOMEM; + /* + * First level was allocated but some lower level failed as + * we did not allocate as much as we wanted, + * release partially allocated table. + */ + if (offset < tce_table_size) { + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + return -ENOMEM; + } + /* Setup linux iommu table */ pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, page_shift); + tbl->it_level_size = 1ULL << (level_shift - 3); + tbl->it_indirect_levels = levels - 1; pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", window_size, tce_table_size, bus_offset); @@ -2117,12 +2175,40 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, return 0; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned level) +{ + const unsigned long addr_ul = (unsigned long) addr & + ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (level) { + long i; + u64 *tmp = (u64 *) addr_ul; + + for (i = 0; i < size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, + level - 1); + } + } + + free_pages(addr_ul, get_order(size << 3)); +} + static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) { + const unsigned long size = tbl->it_indirect_levels ? + tbl->it_level_size : tbl->it_size; + if (!tbl->it_size) return; - free_pages(tbl->it_base, get_order(tbl->it_size << 3)); + pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, + tbl->it_indirect_levels); } static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, @@ -2150,7 +2236,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Setup linux iommu table */ rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node, - 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl); + 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, + POWERNV_IOMMU_DEFAULT_LEVELS, tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); goto fail; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index d465b9c32388..765d8ed558d0 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -575,6 +575,19 @@ struct pci_ops pnv_pci_ops = { static __be64 *pnv_tce(struct iommu_table *tbl, long idx) { __be64 *tmp = ((__be64 *)tbl->it_base); + int level = tbl->it_indirect_levels; + const long shift = ilog2(tbl->it_level_size); + unsigned long mask = (tbl->it_level_size - 1) << (level * shift); + + while (level) { + int n = (idx & mask) >> (level * shift); + unsigned long tce = be64_to_cpu(tmp[n]); + + tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); + idx &= ~mask; + mask >>= shift; + --level; + } return tmp + idx; } -- cgit v1.2.3 From 4793d65d1ac056d92b594d05c6aab3c040d913dd Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:20 +1000 Subject: vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API This extends iommu_table_group_ops by a set of callbacks to support dynamic DMA windows management. create_table() creates a TCE table with specific parameters. it receives iommu_table_group to know nodeid in order to allocate TCE table memory closer to the PHB. The exact format of allocated multi-level table might be also specific to the PHB model (not the case now though). This callback calculated the DMA window offset on a PCI bus from @num and stores it in a just created table. set_window() sets the window at specified TVT index + @num on PHB. unset_window() unsets the window from specified TVT. This adds a free() callback to iommu_table_ops to free the memory (potentially a tree of tables) allocated for the TCE table. create_table() and free() are supposed to be called once per VFIO container and set_window()/unset_window() are supposed to be called for every group in a container. This adds IOMMU capabilities to iommu_table_group such as default 32bit window parameters and others. This makes use of new values in vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not advertise pagemasks to the userspace. Signed-off-by: Alexey Kardashevskiy Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 19 ++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 96 ++++++++++++++++++++++++++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 7 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++--- 4 files changed, 124 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 706cfc0b1190..e5541759a37d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -70,6 +70,7 @@ struct iommu_table_ops { /* get() returns a physical address */ unsigned long (*get)(struct iommu_table *tbl, long index); void (*flush)(struct iommu_table *tbl); + void (*free)(struct iommu_table *tbl); }; /* These are used by VIO */ @@ -146,6 +147,17 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, struct iommu_table_group; struct iommu_table_group_ops { + long (*create_table)(struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl); + long (*set_window)(struct iommu_table_group *table_group, + int num, + struct iommu_table *tblnew); + long (*unset_window)(struct iommu_table_group *table_group, + int num); /* Switch ownership from platform code to external user (e.g. VFIO) */ void (*take_ownership)(struct iommu_table_group *table_group); /* Switch ownership from external user (e.g. VFIO) back to core */ @@ -159,6 +171,13 @@ struct iommu_table_group_link { }; struct iommu_table_group { + /* IOMMU properties */ + __u32 tce32_start; + __u32 tce32_size; + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 max_levels; + struct iommu_group *group; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct iommu_table_group_ops *ops; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 21d8c61f0b03..ee3e643c16bb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -1870,6 +1871,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } +static void pnv_ioda2_table_free(struct iommu_table *tbl) +{ + pnv_pci_ioda2_table_free_pages(tbl); + iommu_free_table(tbl, "pnv"); +} + static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API @@ -1877,6 +1884,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, + .free = pnv_ioda2_table_free, }; static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, @@ -1947,6 +1955,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_PAIR); tbl->it_ops = &pnv_ioda1_iommu_ops; + pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; + pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { @@ -1985,7 +1995,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window %llx..%llx pg=%x\n", + pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, start_addr, start_addr + win_size - 1, IOMMU_PAGE_SIZE(tbl)); @@ -1995,7 +2005,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, */ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, + (pe->pe_number << 1) + num, tbl->it_indirect_levels + 1, __pa(tbl->it_base), size << 3, @@ -2040,7 +2050,67 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) pe->tce_bypass_enabled = enable; } +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl); + +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + int nid = pe->phb->hose->node; + __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start; + long ret; + struct iommu_table *tbl; + + tbl = pnv_pci_table_alloc(nid); + if (!tbl) + return -ENOMEM; + + ret = pnv_pci_ioda2_table_alloc_pages(nid, + bus_offset, page_shift, window_size, + levels, tbl); + if (ret) { + iommu_free_table(tbl, "pnv"); + return ret; + } + + tbl->it_ops = &pnv_ioda2_iommu_ops; + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + *ptbl = tbl; + + return 0; +} + #ifdef CONFIG_IOMMU_API +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + long ret; + + pe_info(pe, "Removing DMA window #%d\n", num); + + ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + (pe->pe_number << 1) + num, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (ret) + pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); + else + pnv_pci_ioda2_tce_invalidate_entire(pe); + + pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); + + return ret; +} + static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, @@ -2060,6 +2130,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_set_window, + .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; @@ -2214,7 +2287,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct iommu_table *tbl; + struct iommu_table *tbl = NULL; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -2224,10 +2297,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; @@ -2235,13 +2306,22 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, phb->ioda.m32_pci_base); /* Setup linux iommu table */ - rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node, - 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, - POWERNV_IOMMU_DEFAULT_LEVELS, tbl); + pe->table_group.tce32_start = 0; + pe->table_group.tce32_size = phb->ioda.m32_pci_base; + pe->table_group.max_dynamic_windows_supported = + IOMMU_TABLE_GROUP_MAX_TABLES; + pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; + pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, + IOMMU_PAGE_SHIFT_4K, + pe->table_group.tce32_size, + POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); goto fail; } + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); tbl->it_ops = &pnv_ioda2_iommu_ops; iommu_init_table(tbl, phb->hose->node); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index eaec85b63b34..f2bdfea3b68d 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -127,6 +127,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, u64 phb_id; int64_t rc; static int primary = 1; + struct iommu_table_group *table_group; + struct iommu_table *tbl; pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); @@ -201,7 +203,10 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table() * should not be called for phb->p5ioc2.table_group.tables[0] ever. */ - phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; + tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; + table_group = &phb->p5ioc2.table_group; + table_group->tce32_start = tbl->it_offset << tbl->it_page_shift; + table_group->tce32_size = tbl->it_size << tbl->it_page_shift; } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index a9e2d13c03c0..6d919eb4251f 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -135,7 +135,6 @@ static int tce_iommu_enable(struct tce_container *container) { int ret = 0; unsigned long locked; - struct iommu_table *tbl; struct iommu_table_group *table_group; if (!container->grp) @@ -171,13 +170,19 @@ static int tce_iommu_enable(struct tce_container *container) * this is that we cannot tell here the amount of RAM used by the guest * as this information is only available from KVM and VFIO is * KVM agnostic. + * + * So we do not allow enabling a container without a group attached + * as there is no way to know how much we should increment + * the locked_vm counter. */ table_group = iommu_group_get_iommudata(container->grp); if (!table_group) return -ENODEV; - tbl = table_group->tables[0]; - locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; + if (!table_group->tce32_size) + return -EPERM; + + locked = table_group->tce32_size >> PAGE_SHIFT; ret = try_increment_locked_vm(locked); if (ret) return ret; @@ -350,7 +355,6 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; - struct iommu_table *tbl; struct iommu_table_group *table_group; if (WARN_ON(!container->grp)) @@ -358,8 +362,7 @@ static long tce_iommu_ioctl(void *iommu_data, table_group = iommu_group_get_iommudata(container->grp); - tbl = table_group->tables[0]; - if (WARN_ON_ONCE(!tbl)) + if (!table_group) return -ENXIO; minsz = offsetofend(struct vfio_iommu_spapr_tce_info, @@ -371,8 +374,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << tbl->it_page_shift; - info.dma32_window_size = tbl->it_size << tbl->it_page_shift; + info.dma32_window_start = table_group->tce32_start; + info.dma32_window_size = table_group->tce32_size; info.flags = 0; if (copy_to_user((void __user *)arg, &info, minsz)) -- cgit v1.2.3 From c035e37b58c75ca216bfd1d5de3c1080ac0022b9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:21 +1000 Subject: powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release The existing code programmed TVT#0 with some address and then immediately released that memory. This makes use of pnv_pci_ioda2_unset_window() and pnv_pci_ioda2_set_bypass() which do correct resource release and TVT update. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ee3e643c16bb..510bb9c8a9e1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1288,34 +1288,21 @@ m64_failed: return -EBUSY; } +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num); +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); + static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) { - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; struct iommu_table *tbl; - unsigned long addr; int64_t rc; - bus = dev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; tbl = pe->table_group.tables[0]; - addr = tbl->it_base; - - opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(addr), - 0, 0x1000); - - rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, - pe->pe_number, - (pe->pe_number << 1) + 1, - pe->tce_bypass_base, - 0); + rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); -- cgit v1.2.3 From 0054719386d96984153ad31d714a8be4ec7eba80 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:22 +1000 Subject: powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table This adds a way for the IOMMU user to know how much a new table will use so it can be accounted in the locked_vm limit before allocation happens. This stores the allocated table size in pnv_pci_ioda2_get_table_size() so the locked_vm counter can be updated correctly when a table is being disposed. This defines an iommu_table_group_ops callback to let VFIO know how much memory will be locked if a table is created. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 5 +++++ arch/powerpc/platforms/powernv/pci-ioda.c | 34 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index e5541759a37d..9d3749287689 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -99,6 +99,7 @@ struct iommu_table { unsigned long it_size; /* Size of iommu table in entries */ unsigned long it_indirect_levels; unsigned long it_level_size; + unsigned long it_allocated_size; unsigned long it_offset; /* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ @@ -147,6 +148,10 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, struct iommu_table_group; struct iommu_table_group_ops { + unsigned long (*get_table_size)( + __u32 page_shift, + __u64 window_size, + __u32 levels); long (*create_table)(struct iommu_table_group *table_group, int num, __u32 page_shift, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 510bb9c8a9e1..a7e098dba23d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2074,6 +2074,38 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, } #ifdef CONFIG_IOMMU_API +static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels) +{ + unsigned long bytes = 0; + const unsigned window_shift = ilog2(window_size); + unsigned entries_shift = window_shift - page_shift; + unsigned table_shift = entries_shift + 3; + unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift); + unsigned long direct_table_size; + + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) || + (window_size > memory_hotplug_max()) || + !is_power_of_2(window_size)) + return 0; + + /* Calculate a direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + table_shift = entries_shift + 3; + table_shift = max_t(unsigned, table_shift, PAGE_SHIFT); + direct_table_size = 1UL << table_shift; + + for ( ; levels; --levels) { + bytes += _ALIGN_UP(tce_table_size, direct_table_size); + + tce_table_size /= direct_table_size; + tce_table_size <<= 3; + tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size); + } + + return bytes; +} + static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num) { @@ -2117,6 +2149,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, .create_table = pnv_pci_ioda2_create_table, .set_window = pnv_pci_ioda2_set_window, .unset_window = pnv_pci_ioda2_unset_window, @@ -2228,6 +2261,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, page_shift); tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; + tbl->it_allocated_size = offset; pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", window_size, tce_table_size, bus_offset); -- cgit v1.2.3 From 46d3e1e16294c587a74093b1f5474c1b33b72381 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 5 Jun 2015 16:35:23 +1000 Subject: vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control Before the IOMMU user (VFIO) would take control over the IOMMU table belonging to a specific IOMMU group. This approach did not allow sharing tables between IOMMU groups attached to the same container. This introduces a new IOMMU ownership flavour when the user can not just control the existing IOMMU table but remove/create tables on demand. If an IOMMU implements take/release_ownership() callbacks, this lets the user have full control over the IOMMU group. When the ownership is taken, the platform code removes all the windows so the caller must create them. Before returning the ownership back to the platform code, VFIO unprograms and removes all the tables it created. This changes IODA2's onwership handler to remove the existing table rather than manipulating with the existing one. From now on, iommu_take_ownership() and iommu_release_ownership() are only called from the vfio_iommu_spapr_tce driver. Old-style ownership is still supported allowing VFIO to run on older P5IOC2 and IODA IO controllers. No change in userspace-visible behaviour is expected. Since it recreates TCE tables on each ownership change, related kernel traces will appear more often. This adds a pnv_pci_ioda2_setup_default_config() which is called when PE is being configured at boot time and when the ownership is passed from VFIO to the platform code. Signed-off-by: Alexey Kardashevskiy [aw: for the vfio related changes] Acked-by: Alex Williamson Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 101 ++++++++++++++++-------------- drivers/vfio/vfio_iommu_spapr_tce.c | 88 +++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 48 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index a7e098dba23d..b9f0f430e249 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2073,6 +2073,49 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, return 0; } +static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) +{ + struct iommu_table *tbl = NULL; + long rc; + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, + IOMMU_PAGE_SHIFT_4K, + pe->table_group.tce32_size, + POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + if (rc) { + pe_err(pe, "Failed to create 32-bit TCE table, err %ld", + rc); + return rc; + } + + iommu_init_table(tbl, pe->phb->hose->node); + + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + if (rc) { + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", + rc); + pnv_ioda2_table_free(tbl); + return rc; + } + + if (!pnv_iommu_bypass_disabled) + pnv_pci_ioda2_set_bypass(pe, true); + + /* OPAL variant of PHB3 invalidated TCEs */ + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + if (pe->flags & PNV_IODA_PE_DEV) + set_iommu_table_base(&pe->pdev->dev, tbl); + + return 0; +} + #ifdef CONFIG_IOMMU_API static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) @@ -2134,9 +2177,12 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); + /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ + struct iommu_table *tbl = pe->table_group.tables[0]; - iommu_take_ownership(table_group->tables[0]); pnv_pci_ioda2_set_bypass(pe, false); + pnv_pci_ioda2_unset_window(&pe->table_group, 0); + pnv_ioda2_table_free(tbl); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) @@ -2144,8 +2190,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); - iommu_release_ownership(table_group->tables[0]); - pnv_pci_ioda2_set_bypass(pe, true); + pnv_pci_ioda2_setup_default_config(pe); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { @@ -2308,7 +2353,6 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct iommu_table *tbl = NULL; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -2333,58 +2377,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; - - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - pe->table_group.tce32_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); - if (rc) { - pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); - goto fail; - } - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); - - tbl->it_ops = &pnv_ioda2_iommu_ops; - iommu_init_table(tbl, phb->hose->node); #ifdef CONFIG_IOMMU_API pe->table_group.ops = &pnv_pci_ioda2_ops; #endif - rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table," - " err %ld\n", rc); - goto fail; + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + return; } - /* OPAL variant of PHB3 invalidated TCEs */ - if (phb->ioda.tce_inval_reg) - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); - - if (pe->flags & PNV_IODA_PE_DEV) { - /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. - */ - set_iommu_table_base(&pe->pdev->dev, tbl); + if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); - - /* Also create a bypass window */ - if (!pnv_iommu_bypass_disabled) - pnv_pci_ioda2_set_bypass(pe, true); - - return; -fail: - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; - if (tbl) { - pnv_pci_ioda2_table_free_pages(tbl); - pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_free_table(tbl, "pnv"); - } } static void pnv_ioda_setup_dma(struct pnv_phb *phb) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6d919eb4251f..203caacf2242 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -333,6 +333,45 @@ static long tce_iommu_build(struct tce_container *container, return ret; } +static long tce_iommu_create_table(struct tce_container *container, + struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl) +{ + long ret, table_size; + + table_size = table_group->ops->get_table_size(page_shift, window_size, + levels); + if (!table_size) + return -EINVAL; + + ret = try_increment_locked_vm(table_size >> PAGE_SHIFT); + if (ret) + return ret; + + ret = table_group->ops->create_table(table_group, num, + page_shift, window_size, levels, ptbl); + + WARN_ON(!ret && !(*ptbl)->it_ops->free); + WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + + if (ret) + decrement_locked_vm(table_size >> PAGE_SHIFT); + + return ret; +} + +static void tce_iommu_free_table(struct iommu_table *tbl) +{ + unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; + + tbl->it_ops->free(tbl); + decrement_locked_vm(pages); +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -546,15 +585,62 @@ static int tce_iommu_take_ownership(struct tce_container *container, static void tce_iommu_release_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { + long i; + + if (!table_group->ops->unset_window) { + WARN_ON_ONCE(1); + return; + } + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + /* Store table pointer as unset_window resets it */ + struct iommu_table *tbl = table_group->tables[i]; + + if (!tbl) + continue; + + table_group->ops->unset_window(table_group, i); + tce_iommu_clear(container, tbl, + tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + } + table_group->ops->release_ownership(table_group); } static long tce_iommu_take_ownership_ddw(struct tce_container *container, struct iommu_table_group *table_group) { + long ret; + struct iommu_table *tbl = NULL; + + if (!table_group->ops->create_table || !table_group->ops->set_window || + !table_group->ops->release_ownership) { + WARN_ON_ONCE(1); + return -EFAULT; + } + table_group->ops->take_ownership(table_group); - return 0; + ret = tce_iommu_create_table(container, + table_group, + 0, /* window number */ + IOMMU_PAGE_SHIFT_4K, + table_group->tce32_size, + 1, /* default levels */ + &tbl); + if (!ret) { + ret = table_group->ops->set_window(table_group, 0, tbl); + if (ret) + tce_iommu_free_table(tbl); + else + table_group->tables[0] = tbl; + } + + if (ret) + table_group->ops->release_ownership(table_group); + + return ret; } static int tce_iommu_attach_group(void *iommu_data, -- cgit v1.2.3 From 4bece972fce6e597cb513bdcae4a04e14fc0dd81 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 15 Jun 2015 15:01:32 +1000 Subject: powerpc/powernv: pnv_init_idle_states() should only run on powernv Although this init call checks for device tree properties before doing anything, it should still only run on powernv machines. Reviewed-by: Shreyas B Prabhu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index bd39a120bd60..59d735d2e5c0 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -289,5 +290,4 @@ out_free: out: return 0; } - -subsys_initcall(pnv_init_idle_states); +machine_subsys_initcall(powernv, pnv_init_idle_states); -- cgit v1.2.3 From 02b6505c8f84cbb4be459c9bf8ebbb7b0754e764 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 17 Jun 2015 11:36:57 +1000 Subject: powerpc/powernv: Increase opal-irqchip initcall priority The eeh subsystem for powernv requires the opal event irqchip to be initialised prior to initialisation or the following errors are produced (and eeh doesn't work as expected): irq: XICS didn't like hwirq-0x9 to VIRQ17 mapping (rc=-22) pnv_eeh_post_init: Can't request OPAL event interrupt (0) On powernv eeh is initialised from a subsys_initcall due to a check for machine_is(powernv) in eeh_init(). This patch increases the initcall priority of opal_event_init() to an arch_initcall to ensure the opal event interface is initialised prior to any users of it. Signed-off-by: Alistair Popple Reported-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-irqchip.c | 4 ++++ arch/powerpc/platforms/powernv/opal.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 841135f48981..e2e7d75f52f3 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -231,6 +231,7 @@ out: of_node_put(opal_node); return rc; } +machine_arch_initcall(powernv, opal_event_init); /** * opal_event_request(unsigned int opal_event_nr) - Request an event @@ -244,6 +245,9 @@ out: */ int opal_event_request(unsigned int opal_event_nr) { + if (WARN_ON_ONCE(!opal_event_irqchip.domain)) + return NO_IRQ; + return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr); } EXPORT_SYMBOL(opal_event_request); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 9e9c483eee4d..f084afa0e3ba 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -657,9 +657,6 @@ static int __init opal_init(void) return -ENODEV; } - /* Initialise OPAL events */ - opal_event_init(); - /* Register OPAL consoles if any ports */ if (firmware_has_feature(FW_FEATURE_OPALv2)) consoles = of_find_node_by_path("/ibm,opal/consoles"); -- cgit v1.2.3 From 7185795a62589292015484985635b4a38029a2b9 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 15 Jun 2015 17:25:34 +0800 Subject: powerpc/powernv: fix construction of opal PRD messages We currently have a bug in the PRD code, where the contents of an incoming message (beyond the header) will be overwritten by the list item manipulations when adding to to the prd_msg_queue. This change reorders struct opal_prd_msg_queue_item, so that the message body doesn't overlap the list_head. We also clarify the memcpy of the message, as we're copying unnecessary bytes at the end of the message data. Signed-off-by: Jeremy Kerr Acked-by: Stewart Smith Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-prd.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index d9e72bde79f5..46cb3feb0a13 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -32,9 +32,13 @@ #include +/** + * The msg member must be at the end of the struct, as it's followed by the + * message data. + */ struct opal_prd_msg_queue_item { - struct opal_prd_msg_header msg; struct list_head list; + struct opal_prd_msg_header msg; }; static struct device_node *prd_node; @@ -351,22 +355,23 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, struct opal_prd_msg_queue_item *item; struct opal_prd_msg_header *hdr; struct opal_msg *msg = _msg; + int msg_size, item_size; unsigned long flags; - int size; if (msg_type != OPAL_MSG_PRD) return 0; - /* Calculate total size of the item we need to store. The 'size' field - * in the header includes the header itself. */ + /* Calculate total size of the message and item we need to store. The + * 'size' field in the header includes the header itself. */ hdr = (void *)msg->params; - size = (sizeof(*item) - sizeof(item->msg)) + be16_to_cpu(hdr->size); + msg_size = be16_to_cpu(hdr->size); + item_size = msg_size + sizeof(*item) - sizeof(item->msg); - item = kzalloc(size, GFP_ATOMIC); + item = kzalloc(item_size, GFP_ATOMIC); if (!item) return -ENOMEM; - memcpy(&item->msg, msg->params, size); + memcpy(&item->msg, msg->params, msg_size); spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); list_add_tail(&item->list, &opal_prd_msg_queue); -- cgit v1.2.3 From b5926430dfa07d17e5d768c16b0d81c13a793f7c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 15 Jun 2015 17:49:59 +1000 Subject: powerpc/iommu/ioda2: Enable compile with IOV=on and IOMMU_API=off The pnv_pci_ioda2_unset_window() function is used to do the final cleanup of a DMA window being released: - via VFIO ioctl by the guest request; - via unplugging a virtual PCI function. However the function was under #ifdef CONFIG_IOMMU_API and was missing. This moves the helper outside of IOMMU_API block and enables it for either or both IOMMU_API and PCI_IOV. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 50 ++++++++++++++++--------------- 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b9f0f430e249..8424f5cdf0b1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2116,6 +2116,32 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) return 0; } +#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + long ret; + + pe_info(pe, "Removing DMA window #%d\n", num); + + ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + (pe->pe_number << 1) + num, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (ret) + pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); + else + pnv_pci_ioda2_tce_invalidate_entire(pe); + + pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); + + return ret; +} +#endif + #ifdef CONFIG_IOMMU_API static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) @@ -2149,30 +2175,6 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, return bytes; } -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, - int num) -{ - struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = pe->phb; - long ret; - - pe_info(pe, "Removing DMA window #%d\n", num); - - ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - (pe->pe_number << 1) + num, - 0/* levels */, 0/* table address */, - 0/* table size */, 0/* page size */); - if (ret) - pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); - else - pnv_pci_ioda2_tce_invalidate_entire(pe); - - pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); - - return ret; -} - static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, -- cgit v1.2.3 From 5c89a87d13d168eacb8110810544a98ce0f4319d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 18 Jun 2015 11:41:36 +1000 Subject: powerpc/powernv: Fix wrong IOMMU table in pnv_ioda_setup_bus_dma() When pnv_pci_ioda_fixup() is called during PHB fixup time, each PE in the sorted list of PEs (phb::pe_dma_list) is iterated to setup the PE's DMA32 space by pnv_ioda_setup_bus_dma() if the PE's DMA32 weight is bigger than zero. The function also assigns all the subordinate PCI devices of the PE's primary bus with the PE's DMA32 IOMMU table. It causes the PCI devicess in the child PEs, which don't have DMA weight, receives wrong IOMMU table and then IOMMU group. The patch fixes above issue by more check on the PE's coverage and don't assign IOMMU table to those PCI devices, which belong to the child PEs. The problem was found on Firestone platform initially. Suggested-by: Gavin Shan Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/platforms') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8424f5cdf0b1..5738d315248b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1661,7 +1661,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); iommu_add_device(&dev->dev); - if (dev->subordinate) + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } -- cgit v1.2.3