author     Linus Torvalds <torvalds@linux-foundation.org>    2019-09-20 11:48:06 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>    2019-09-20 11:48:06 -0700
commit     45824fc0da6e46cc5d563105e1eaaf3098a686f9 (patch)
tree       8e57c1f18104ed5f0d74d9eed9dc0365b3c137b8 /arch/powerpc/platforms
parent     8c2b418c3f95a488f5226870eee68574d323f0f8 (diff)
parent     d9101bfa6adc831bda8836c4d774820553c14942 (diff)
download   linux-45824fc0da6e46cc5d563105e1eaaf3098a686f9.tar.bz2
Merge tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman:
"This is a bit late, partly due to me travelling, and partly due to a
power outage knocking out some of my test systems *while* I was
travelling.
- Initial support for running on a system with an Ultravisor, which
is software that runs below the hypervisor and protects guests
against some attacks by the hypervisor.
- Support for building the kernel to run as a "Secure Virtual
Machine", ie. as a guest capable of running on a system with an
Ultravisor.
- Some changes to our DMA code on bare metal, to allow devices with
medium sized DMA masks (> 32 && < 59 bits) to use more than 2GB of
DMA space.
- Support for firmware assisted crash dumps on bare metal (powernv).
- Two series fixing bugs in and refactoring our PCI EEH code.
- A large series refactoring our exception entry code to use gas
macros, both to make it more readable and also enable some future
optimisations.
As well as many cleanups and other minor features & fixups.
Thanks to: Adam Zerella, Alexey Kardashevskiy, Alistair Popple, Andrew
Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual,
Balbir Singh, Benjamin Herrenschmidt, Cédric Le Goater, Christophe
JAILLET, Christophe Leroy, Christopher M. Riedl, Christoph Hellwig,
Claudio Carvalho, Daniel Axtens, David Gibson, David Hildenbrand,
Desnes A. Nunes do Rosario, Ganesh Goudar, Gautham R. Shenoy, Greg
Kurz, Guerney Hunt, Gustavo Romero, Halil Pasic, Hari Bathini, Joakim
Tjernlund, Jonathan Neuschafer, Jordan Niethe, Leonardo Bras, Lianbo
Jiang, Madhavan Srinivasan, Mahesh Salgaonkar, Mahesh Salgaonkar,
Masahiro Yamada, Maxiwell S. Garcia, Michael Anderson, Nathan
Chancellor, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Oliver
O'Halloran, Qian Cai, Ram Pai, Ravi Bangoria, Reza Arbab, Ryan Grimm,
Sam Bobroff, Santosh Sivaraj, Segher Boessenkool, Sukadev Bhattiprolu,
Thiago Bauermann, Thiago Jung Bauermann, Thomas Gleixner, Tom
Lendacky, Vasant Hegde"
* tag 'powerpc-5.4-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (264 commits)
powerpc/mm/mce: Keep irqs disabled during lockless page table walk
powerpc: Use ftrace_graph_ret_addr() when unwinding
powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
ftrace: Look up the address of return_to_handler() using helpers
powerpc: dump kernel log before carrying out fadump or kdump
docs: powerpc: Add missing documentation reference
powerpc/xmon: Fix output of XIVE IPI
powerpc/xmon: Improve output of XIVE interrupts
powerpc/mm/radix: remove useless kernel messages
powerpc/fadump: support holes in kernel boot memory area
powerpc/fadump: remove RMA_START and RMA_END macros
powerpc/fadump: update documentation about option to release opalcore
powerpc/fadump: consider f/w load area
powerpc/opalcore: provide an option to invalidate /sys/firmware/opal/core file
powerpc/opalcore: export /sys/firmware/opal/core for analysing opal crashes
powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP
powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel
powerpc/fadump: improve how crashed kernel's memory is reserved
powerpc/fadump: consider reserved ranges while releasing memory
powerpc/fadump: make crash memory ranges array allocation generic
...
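Several of the opalcore/fadump commits listed above export the crashed OPAL firmware's state as an ELF core at /sys/firmware/opal/core and add a write-only /sys/kernel/fadump_release_opalcore knob that frees the memory backing it (the store handler in the diff accepts only "1"). A minimal userspace sketch of the intended flow, assuming those paths from this series and a hypothetical /var/crash destination:

        #include <stdio.h>

        /*
         * Sketch only: copy the exported OPAL core somewhere persistent,
         * then write "1" to the release knob so the kernel can free the
         * memory that was holding it.
         */
        int main(void)
        {
                FILE *in = fopen("/sys/firmware/opal/core", "rb");
                FILE *out = fopen("/var/crash/opalcore", "wb");
                char buf[4096];
                size_t n;

                if (!in || !out)
                        return 1;
                while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                        fwrite(buf, 1, n, out);
                fclose(in);
                fclose(out);

                out = fopen("/sys/kernel/fadump_release_opalcore", "w");
                if (!out)
                        return 1;
                fputs("1", out);        /* anything other than "1" is rejected with -EINVAL */
                fclose(out);
                return 0;
        }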
Diffstat (limited to 'arch/powerpc/platforms')
44 files changed, 3056 insertions, 1062 deletions
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index b369ed4e3675..25ebe634a661 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -272,14 +272,6 @@ config PPC4xx_GPIO help Enable gpiolib support for ppc440 based boards -config PPC4xx_OCM - bool "PPC4xx On Chip Memory (OCM) support" - depends on 4xx - select PPC_LIB_RHEAP - help - Enable OCM support for PowerPC 4xx platforms with on chip memory, - OCM provides the fast place for memory access to improve performance. - # 44x specific CPU modules, selected based on the board above. config 440EP bool diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile index f5ae27ca131b..d009d2e0b9e8 100644 --- a/arch/powerpc/platforms/4xx/Makefile +++ b/arch/powerpc/platforms/4xx/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += uic.o machine_check.o -obj-$(CONFIG_PPC4xx_OCM) += ocm.o obj-$(CONFIG_4xx_SOC) += soc.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c deleted file mode 100644 index ba3257406ced..000000000000 --- a/arch/powerpc/platforms/4xx/ocm.c +++ /dev/null @@ -1,390 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * PowerPC 4xx OCM memory allocation support - * - * (C) Copyright 2009, Applied Micro Circuits Corporation - * Victor Gallardo (vgallardo@amcc.com) - * - * See file CREDITS for list of people who contributed to this - * project. - */ - -#include <linux/kernel.h> -#include <linux/dma-mapping.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <asm/rheap.h> -#include <asm/ppc4xx_ocm.h> -#include <linux/slab.h> -#include <linux/debugfs.h> - -#define OCM_DISABLED 0 -#define OCM_ENABLED 1 - -struct ocm_block { - struct list_head list; - void __iomem *addr; - int size; - const char *owner; -}; - -/* non-cached or cached region */ -struct ocm_region { - phys_addr_t phys; - void __iomem *virt; - - int memtotal; - int memfree; - - rh_info_t *rh; - struct list_head list; -}; - -struct ocm_info { - int index; - int status; - int ready; - - phys_addr_t phys; - - int alignment; - int memtotal; - int cache_size; - - struct ocm_region nc; /* non-cached region */ - struct ocm_region c; /* cached region */ -}; - -static struct ocm_info *ocm_nodes; -static int ocm_count; - -static struct ocm_info *ocm_get_node(unsigned int index) -{ - if (index >= ocm_count) { - printk(KERN_ERR "PPC4XX OCM: invalid index"); - return NULL; - } - - return &ocm_nodes[index]; -} - -static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr) -{ - struct ocm_block *blk, *tmp; - unsigned long offset; - - if (!ocm_reg->virt) - return 0; - - list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) { - if (blk->addr == addr) { - offset = addr - ocm_reg->virt; - ocm_reg->memfree += blk->size; - rh_free(ocm_reg->rh, offset); - list_del(&blk->list); - kfree(blk); - return 1; - } - } - - return 0; -} - -static void __init ocm_init_node(int count, struct device_node *node) -{ - struct ocm_info *ocm; - - const unsigned int *cell_index; - const unsigned int *cache_size; - int len; - - struct resource rsrc; - - ocm = ocm_get_node(count); - - cell_index = of_get_property(node, "cell-index", &len); - if (!cell_index) { - printk(KERN_ERR "PPC4XX OCM: missing cell-index property"); - return; - } - ocm->index = *cell_index; - - if (of_device_is_available(node)) - ocm->status = OCM_ENABLED; - - cache_size = 
of_get_property(node, "cached-region-size", &len); - if (cache_size) - ocm->cache_size = *cache_size; - - if (of_address_to_resource(node, 0, &rsrc)) { - printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n", - ocm->index); - return; - } - - ocm->phys = rsrc.start; - ocm->memtotal = (rsrc.end - rsrc.start + 1); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n", - ocm->index, ocm->memtotal, - (ocm->status == OCM_DISABLED) ? "disabled" : "enabled"); - - if (ocm->status == OCM_DISABLED) - return; - - /* request region */ - - if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) { - printk(KERN_ERR "PPC4XX OCM%d: could not request region\n", - ocm->index); - return; - } - - /* Configure non-cached and cached regions */ - - ocm->nc.phys = ocm->phys; - ocm->nc.memtotal = ocm->memtotal - ocm->cache_size; - ocm->nc.memfree = ocm->nc.memtotal; - - ocm->c.phys = ocm->phys + ocm->nc.memtotal; - ocm->c.memtotal = ocm->cache_size; - ocm->c.memfree = ocm->c.memtotal; - - if (ocm->nc.memtotal == 0) - ocm->nc.phys = 0; - - if (ocm->c.memtotal == 0) - ocm->c.phys = 0; - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n", - ocm->index, ocm->nc.memtotal); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n", - ocm->index, ocm->c.memtotal); - - /* ioremap the non-cached region */ - if (ocm->nc.memtotal) { - ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG)); - - if (!ocm->nc.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap non-cached memory\n", - ocm->index); - ocm->nc.memfree = 0; - return; - } - } - - /* ioremap the cached region */ - - if (ocm->c.memtotal) { - ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL)); - - if (!ocm->c.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap cached memory\n", - ocm->index); - ocm->c.memfree = 0; - return; - } - } - - /* Create Remote Heaps */ - - ocm->alignment = 4; /* default 4 byte alignment */ - - if (ocm->nc.virt) { - ocm->nc.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal); - } - - if (ocm->c.virt) { - ocm->c.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal); - } - - INIT_LIST_HEAD(&ocm->nc.list); - INIT_LIST_HEAD(&ocm->c.list); - - ocm->ready = 1; -} - -static int ocm_debugfs_show(struct seq_file *m, void *v) -{ - struct ocm_block *blk, *tmp; - unsigned int i; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - seq_printf(m, "PPC4XX OCM : %d\n", ocm->index); - seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys)); - seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal); - seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal); - - seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys)); - seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt); - seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) { - seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys)); - seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt); - seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal); - seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) { - seq_printf(m, "C.MemUsed : %d 
Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_putc(m, '\n'); - } - - return 0; -} - -static int ocm_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, ocm_debugfs_show, NULL); -} - -static const struct file_operations ocm_debugfs_fops = { - .open = ocm_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ocm_debugfs_init(void) -{ - struct dentry *junk; - - junk = debugfs_create_dir("ppc4xx_ocm", 0); - if (!junk) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n"); - return -1; - } - - if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n"); - return -1; - } - - return 0; -} - -void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align, - int flags, const char *owner) -{ - void __iomem *addr = NULL; - unsigned long offset; - struct ocm_info *ocm; - struct ocm_region *ocm_reg; - struct ocm_block *ocm_blk; - int i; - - for (i = 0; i < ocm_count; i++) { - ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (flags == PPC4XX_OCM_NON_CACHED) - ocm_reg = &ocm->nc; - else - ocm_reg = &ocm->c; - - if (!ocm_reg->virt) - continue; - - if (align < ocm->alignment) - align = ocm->alignment; - - offset = rh_alloc_align(ocm_reg->rh, size, align, NULL); - - if (IS_ERR_VALUE(offset)) - continue; - - ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL); - if (!ocm_blk) { - rh_free(ocm_reg->rh, offset); - break; - } - - *phys = ocm_reg->phys + offset; - addr = ocm_reg->virt + offset; - size = ALIGN(size, align); - - ocm_blk->addr = addr; - ocm_blk->size = size; - ocm_blk->owner = owner; - list_add_tail(&ocm_blk->list, &ocm_reg->list); - - ocm_reg->memfree -= size; - - break; - } - - return addr; -} - -void ppc4xx_ocm_free(const void *addr) -{ - int i; - - if (!addr) - return; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (ocm_free_region(&ocm->nc, addr) || - ocm_free_region(&ocm->c, addr)) - return; - } -} - -static int __init ppc4xx_ocm_init(void) -{ - struct device_node *np; - int count; - - count = 0; - for_each_compatible_node(np, NULL, "ibm,ocm") - count++; - - if (!count) - return 0; - - ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL); - if (!ocm_nodes) - return -ENOMEM; - - ocm_count = count; - count = 0; - - for_each_compatible_node(np, NULL, "ibm,ocm") { - ocm_init_node(count, np); - count++; - } - - ocm_debugfs_init(); - - return 0; -} - -arch_initcall(ppc4xx_ocm_init); diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index f3fb79fccc72..d82e3664ffdf 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -197,7 +197,8 @@ endmenu config PPC601_SYNC_FIX bool "Workarounds for PPC601 bugs" - depends on PPC_BOOK3S_32 && PPC_PMAC + depends on PPC_BOOK3S_601 && PPC_PMAC + default y help Some versions of the PPC601 (the first PowerPC chip) have bugs which mean that extra synchronization instructions are required near diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 56a7c814160d..12543e53fa96 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -6,6 +6,9 @@ config PPC64 This option selects whether a 32-bit or a 64-bit kernel will be built. 
+config PPC_BOOK3S_32 + bool + menu "Processor support" choice prompt "Processor Type" @@ -21,13 +24,20 @@ choice If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx. -config PPC_BOOK3S_32 - bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" +config PPC_BOOK3S_6xx + bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601" + select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP +config PPC_BOOK3S_601 + bool "PowerPC 601" + select PPC_BOOK3S_32 + select PPC_FPU + select PPC_HAVE_KUAP + config PPC_85xx bool "Freescale 85xx" select E500 @@ -450,8 +460,10 @@ config NOT_COHERENT_CACHE depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU + select DMA_DIRECT_REMAP default n if PPC_47x default y diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 16dfee29aa41..ca9ffc1c8685 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_size = size >> window->table.it_page_shift; window->table.it_ops = &cell_iommu_ops; - iommu_init_table(&window->table, iommu->nid); + iommu_init_table(&window->table, iommu->nid, 0, 0); pr_debug("\tioid %d\n", window->ioid); pr_debug("\tblocksize %ld\n", window->table.it_blocksize); diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 77fee09104f8..b500a6e47e6b 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void) */ iommu_table_iobmap.it_blocksize = 4; iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; - iommu_init_table(&iommu_table_iobmap, 0); + iommu_init_table(&iommu_table_iobmap, 0, 0, 0); pr_debug(" <- %s\n", __func__); } diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..938803eab0ad 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select PPC_SCOM select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL @@ -47,3 +46,7 @@ config PPC_VAS VAS adapters are found in POWER9 based systems. If unsure, say N. 
+ +config SCOM_DEBUGFS + bool "Expose SCOM controllers via debugfs" + depends on DEBUG_FS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..a3ac9646119d 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +obj-$(CONFIG_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o -obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o obj-$(CONFIG_OCXL_BASE) += ocxl.o +obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 620a986209f5..6bc24a47e9ef 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -34,6 +34,7 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" static int eeh_event_irq = -EINVAL; @@ -41,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_add_device_early(pdn); eeh_add_device_late(pdev); eeh_sysfs_add_device(pdev); @@ -199,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ +void pnv_eeh_enable_phbs(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + } +} + /** * pnv_eeh_post_init - EEH platform dependent post initialization * @@ -213,9 +230,7 @@ int pnv_eeh_post_init(void) struct pnv_phb *phb; int ret = 0; - /* Probe devices & build address cache */ - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); @@ -237,19 +252,11 @@ int pnv_eeh_post_init(void) if (!eeh_enabled()) disable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; - /* - * If EEH is enabled, we're going to rely on that. - * Otherwise, we restore to conventional mechanism - * to clear frozen PE during PCI config access. 
- */ - if (eeh_enabled()) - phb->flags |= PNV_PHB_FLAG_EEH; - else - phb->flags &= ~PNV_PHB_FLAG_EEH; - /* Create debugfs entries */ #ifdef CONFIG_DEBUG_FS if (phb->has_dbgfs || !phb->dbgfs) @@ -377,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -402,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) /* Create PE */ ret = eeh_add_to_parent_pe(edev); if (ret) { - pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n", - __func__, hose->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret); + eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; } @@ -453,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_add_flag(EEH_ENABLED); + if (!eeh_has_flag(EEH_ENABLED)) { + enable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + eeh_add_flag(EEH_ENABLED); + } /* Save memory bars */ eeh_save_bars(edev); + eeh_edev_dbg(edev, "EEH enabled on device\n"); + return NULL; } @@ -837,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) int aer = edev ? edev->aer_cap : 0; u32 ctrl; - pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n", __func__, pci_domain_nr(dev->bus), dev->bus->number, option); @@ -895,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option) if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) return __pnv_eeh_bridge_reset(pdev, option); + pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(pdev->bus), + pdev->bus->number, option); + switch (option) { case EEH_RESET_FUNDAMENTAL: scope = OPAL_RESET_PCI_FUNDAMENTAL; @@ -1113,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } + if (pci_is_root_bus(bus)) + return pnv_eeh_root_reset(hose, option); + /* - * If dealing with the root bus (or the bus underneath the - * root port), we reset the bus underneath the root port. + * For hot resets try use the generic PCI error recovery reset + * functions. These correctly handles the case where the secondary + * bus is behind a hotplug slot and it will use the slot provided + * reset methods to prevent spurious hotplug events during the reset. * - * The cxl driver depends on this behaviour for bi-modal card - * switching. + * Fundemental resets need to be handled internally to EEH since the + * PCI core doesn't really have a concept of a fundemental reset, + * mainly because there's no standard way to generate one. Only a + * few devices require an FRESET so it should be fine. */ - if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - return pnv_eeh_root_reset(hose, option); + if (option != EEH_RESET_FUNDAMENTAL) { + /* + * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the + * de-assert step. It's like the OPAL reset API was + * poorly designed or something... + */ + if (option == EEH_RESET_DEACTIVATE) + return 0; + rc = pci_bus_error_reset(bus->self); + if (!rc) + return 0; + } + + /* otherwise, use the generic bridge reset. 
this might call into FW */ + if (pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); return pnv_eeh_bridge_reset(bus->self, option); } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 09f49eed7fb8..78599bca66c2 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.ptcr = mfspr(SPRN_PTCR); sprs.rpr = mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); - sprs.ldbar = mfspr(SPRN_LDBAR); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + sprs.ldbar = mfspr(SPRN_LDBAR); sprs_saved = true; @@ -789,7 +790,8 @@ core_woken: mtspr(SPRN_MMCR0, sprs.mmcr0); mtspr(SPRN_MMCR1, sprs.mmcr1); mtspr(SPRN_MMCR2, sprs.mmcr2); - mtspr(SPRN_LDBAR, sprs.ldbar); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, sprs.ldbar); mtspr(SPRN_SPRG3, local_paca->sprg_vdso); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index c16249d251f1..b95b9e3c4c98 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) } EXPORT_SYMBOL(pnv_pci_get_npu_dev); +#ifdef CONFIG_IOMMU_API /* * Returns the PE assoicated with the PCI device of the given * NPU. Returns the linked pci device if pci_dev != NULL. @@ -192,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) return 0; } -/* - * Enables 32 bit DMA on NPU. - */ -static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) -{ - struct pci_dev *gpdev; - struct pnv_ioda_pe *gpe; - int64_t rc; - - /* - * Find the assoicated PCI devices and get the dma window - * information from there. - */ - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - rc = pnv_npu_set_window(&npe->table_group, 0, - gpe->table_group.tables[0]); - - /* - * NVLink devices use the same TCE table configuration as - * their parent device so drivers shouldn't be doing DMA - * operations directly on these devices. - */ - set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); -} - -/* - * Enables bypass mode on the NPU. The NPU only supports one - * window per link, so bypass needs to be explicitly enabled or - * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be - * active at the same time. 
- */ -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - int64_t rc = 0; - phys_addr_t top = memblock_end_of_DRAM(); - - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) - return -EINVAL; - - rc = pnv_npu_unset_window(&npe->table_group, 0); - if (rc != OPAL_SUCCESS) - return rc; - - /* Enable the bypass window */ - - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, top); - - if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - return rc; -} - -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) -{ - int i; - struct pnv_phb *phb; - struct pci_dn *pdn; - struct pnv_ioda_pe *npe; - struct pci_dev *npdev; - - for (i = 0; ; ++i) { - npdev = pnv_pci_get_npu_dev(gpdev, i); - - if (!npdev) - break; - - pdn = pci_get_pdn(npdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return; - - phb = pci_bus_to_host(npdev->bus)->private_data; - - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - - if (bypass) { - dev_info(&npdev->dev, - "Using 64-bit DMA iommu bypass\n"); - pnv_npu_dma_set_bypass(npe); - } else { - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - pnv_npu_dma_set_32(npe); - } - } -} - -#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 29ca523c1c79..a2aa5e433ac8 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ); OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); @@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE); +OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG); +OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000000000000..ed895d82c048 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2019, Hari Bathini, IBM Corporation. 
+ */ + +#define pr_fmt(fmt) "opal core: " fmt + +#include <linux/memblock.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/crash_core.h> +#include <linux/of.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT 8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + u32 num_cpus; + /* PIR value of crashing CPU */ + u32 crashing_cpu; + + /* CPU state data info from F/W */ + u64 cpu_state_destination_vaddr; + u64 cpu_state_data_size; + u64 cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + u64 ptload_addr[MAX_PT_LOAD_CNT]; + u64 ptload_size[MAX_PT_LOAD_CNT]; + u64 ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char *opalcorebuf; + + /* NT_AUXV buffer */ + char auxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + u64 paddr; + size_t size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_mpipl_fadump *opalc_metadata; +static const struct opal_mpipl_fadump *opalc_cpu_metadata; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new_element(void) +{ + return kzalloc(sizeof(struct opalcore), GFP_KERNEL); +} + +static inline int is_opalcore_usable(void) +{ + return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0; +} + +static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, + u32 type, void *data, + size_t data_len) +{ + Elf64_Nhdr *note = (Elf64_Nhdr *)buf; + Elf64_Word namesz = strlen(name) + 1; + + note->n_namesz = cpu_to_be32(namesz); + note->n_descsz = cpu_to_be32(data_len); + note->n_type = cpu_to_be32(type); + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word)); + memcpy(buf, name, namesz); + buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word)); + + return buf; +} + +static void fill_prstatus(struct elf_prstatus *prstatus, int pir, + struct pt_regs *regs) +{ + memset(prstatus, 0, sizeof(struct elf_prstatus)); + elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs); + + /* + * Overload PID with PIR value. + * As a PIR value could also be '0', add an offset of '100' + * to every PIR to avoid misinterpretations in GDB. + */ + prstatus->pr_pid = cpu_to_be32(100 + pir); + prstatus->pr_ppid = cpu_to_be32(1); + + /* + * Indicate SIGUSR1 for crash initiated from kernel. + * SIGTERM otherwise. + */ + if (pir == oc_conf->crashing_cpu) { + short sig; + + sig = kernel_initiated ? 
SIGUSR1 : SIGTERM; + prstatus->pr_cursig = cpu_to_be16(sig); + } +} + +static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf, + u64 opal_boot_entry) +{ + Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; + int idx = 0; + + memset(bufp, 0, AUXV_DESC_SZ); + + /* Entry point of OPAL */ + bufp[idx++] = cpu_to_be64(AT_ENTRY); + bufp[idx++] = cpu_to_be64(opal_boot_entry); + + /* end of vector */ + bufp[idx++] = cpu_to_be64(AT_NULL); + + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + oc_conf->auxv_buf, AUXV_DESC_SZ); + return buf; +} + +/* + * Read from the ELF header and then the crash dump. + * Returns number of bytes read on success, -errno on failure. + */ +static ssize_t read_opalcore(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct opalcore *m; + ssize_t tsz, avail; + loff_t tpos = pos; + + if (pos >= oc_conf->opalcore_size) + return 0; + + /* Adjust count if it goes beyond opalcore size */ + avail = oc_conf->opalcore_size - pos; + if (count > avail) + count = avail; + + if (count == 0) + return 0; + + /* Read ELF core header and/or PT_NOTE segment */ + if (tpos < oc_conf->opalcorebuf_sz) { + tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count); + memcpy(to, oc_conf->opalcorebuf + tpos, tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + + list_for_each_entry(m, &opalcore_list, list) { + /* nothing more to read here */ + if (count == 0) + break; + + if (tpos < m->offset + m->size) { + void *addr; + + tsz = min_t(size_t, m->offset + m->size - tpos, count); + addr = (void *)(m->paddr + tpos - m->offset); + memcpy(to, __va(addr), tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + } + + return (tpos - pos); +} + +static struct bin_attribute opal_core_attr = { + .attr = {.name = "core", .mode = 0400}, + .read = read_opalcore +}; + +/* + * Read CPU state dump data and convert it into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. + */ +static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + struct elf_prstatus prstatus; + Elf64_Word *first_cpu_note; + struct pt_regs regs; + char *bufp; + int i; + + size_per_thread = oc_conf->cpu_state_entry_size; + bufp = __va(oc_conf->cpu_state_destination_vaddr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", oc_conf->num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + /* + * Skip past the first CPU note. Fill this note with the + * crashing CPU's prstatus. 
+ */ + first_cpu_note = buf; + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + + for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + thread_pir = be32_to_cpu(thdr->pir); + + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, false, ®s); + + pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir, + be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip)); + fill_prstatus(&prstatus, thread_pir, ®s); + + if (thread_pir != oc_conf->crashing_cpu) { + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } else { + /* + * Add crashing CPU as the first NT_PRSTATUS note for + * GDB to process the core file appropriately. + */ + append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } + } + + return buf; +} + +static int __init create_opalcore(void) +{ + u64 opal_boot_entry, opal_base_addr, paddr; + u32 hdr_size, cpu_notes_size, count; + struct device_node *dn; + struct opalcore *new; + loff_t opalcore_off; + struct page *page; + Elf64_Phdr *phdr; + Elf64_Ehdr *elf; + int i, ret; + char *bufp; + + /* Get size of header & CPU notes for OPAL core */ + hdr_size = (sizeof(Elf64_Ehdr) + + ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr))); + cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + + CRASH_CORE_NOTE_DESC_BYTES)) + + (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ)); + + /* Allocate buffer to setup OPAL core */ + oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size); + oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz, + GFP_KERNEL | __GFP_ZERO); + if (!oc_conf->opalcorebuf) { + pr_err("Not enough memory to setup OPAL core (size: %lu)\n", + oc_conf->opalcorebuf_sz); + oc_conf->opalcorebuf_sz = 0; + return -ENOMEM; + } + count = oc_conf->opalcorebuf_sz / PAGE_SIZE; + page = virt_to_page(oc_conf->opalcorebuf); + for (i = 0; i < count; i++) + mark_page_reserved(page + i); + + pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf); + + /* Read OPAL related device-tree entries */ + dn = of_find_node_by_name(NULL, "ibm,opal"); + if (dn) { + ret = of_property_read_u64(dn, "opal-base-address", + &opal_base_addr); + pr_debug("opal-base-address: %llx\n", opal_base_addr); + ret |= of_property_read_u64(dn, "opal-boot-address", + &opal_boot_entry); + pr_debug("opal-boot-address: %llx\n", opal_boot_entry); + } + if (!dn || ret) + pr_warn("WARNING: Failed to read OPAL base & entry values\n"); + + /* Use count to keep track of the program headers */ + count = 0; + + bufp = oc_conf->opalcorebuf; + elf = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELFDATA2MSB; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = cpu_to_be16(ET_CORE); + elf->e_machine = cpu_to_be16(ELF_ARCH); + elf->e_version = 
cpu_to_be32(EV_CURRENT); + elf->e_entry = 0; + elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr)); + elf->e_shoff = 0; + elf->e_flags = 0; + + elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr)); + elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr)); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_NOTE); + phdr->p_flags = 0; + phdr->p_align = 0; + phdr->p_paddr = phdr->p_vaddr = 0; + phdr->p_offset = cpu_to_be64(hdr_size); + phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size); + count++; + + opalcore_off = oc_conf->opalcorebuf_sz; + oc_conf->ptload_phdr = (Elf64_Phdr *)bufp; + paddr = 0; + for (i = 0; i < oc_conf->ptload_cnt; i++) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_LOAD); + phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X); + phdr->p_align = 0; + + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = oc_conf->ptload_addr[i]; + new->size = oc_conf->ptload_size[i]; + new->offset = opalcore_off; + list_add_tail(&new->list, &opalcore_list); + + phdr->p_paddr = cpu_to_be64(paddr); + phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr); + phdr->p_filesz = phdr->p_memsz = + cpu_to_be64(oc_conf->ptload_size[i]); + phdr->p_offset = cpu_to_be64(opalcore_off); + + count++; + opalcore_off += oc_conf->ptload_size[i]; + paddr += oc_conf->ptload_size[i]; + } + + elf->e_phnum = cpu_to_be16(count); + + bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp); + bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry); + + oc_conf->opalcore_size = opalcore_off; + return 0; +} + +static void opalcore_cleanup(void) +{ + if (oc_conf == NULL) + return; + + /* Remove OPAL core sysfs file */ + sysfs_remove_bin_file(opal_kobj, &opal_core_attr); + oc_conf->ptload_phdr = NULL; + oc_conf->ptload_cnt = 0; + + /* free the buffer used for setting up OPAL core */ + if (oc_conf->opalcorebuf) { + void *end = (void *)((u64)oc_conf->opalcorebuf + + oc_conf->opalcorebuf_sz); + + free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL); + oc_conf->opalcorebuf = NULL; + oc_conf->opalcorebuf_sz = 0; + } + + kfree(oc_conf); + oc_conf = NULL; +} +__exitcall(opalcore_cleanup); + +static void __init opalcore_config_init(void) +{ + u32 idx, cpu_data_version; + struct device_node *np; + const __be32 *prop; + u64 addr = 0; + int i, ret; + + np = of_find_node_by_path("/ibm,opal/dump"); + if (np == NULL) + return; + + if (!of_device_is_compatible(np, "ibm,opal-dump")) { + pr_warn("Support missing for this f/w version!\n"); + return; + } + + /* Check if dump has been initiated on last reboot */ + prop = of_get_property(np, "mpipl-boot", NULL); + if (!prop) { + of_node_put(np); + return; + } + + /* Get OPAL metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("OPAL metadata addr: %llx\n", addr); + opalc_metadata = __va(addr); + + /* Get OPAL CPU metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL CPU metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opalc_cpu_metadata = __va(addr); + + /* Allocate memory for config buffer */ + oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL); + if 
(oc_conf == NULL) + goto error_out; + + /* Parse OPAL metadata */ + if (opalc_metadata->version != OPAL_MPIPL_VERSION) { + pr_warn("Supported OPAL metadata version: %u, found: %u!\n", + OPAL_MPIPL_VERSION, opalc_metadata->version); + pr_warn("WARNING: F/W using newer OPAL metadata format!!\n"); + } + + oc_conf->ptload_cnt = 0; + idx = be32_to_cpu(opalc_metadata->region_cnt); + if (idx > MAX_PT_LOAD_CNT) { + pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", + MAX_PT_LOAD_CNT, idx); + idx = MAX_PT_LOAD_CNT; + } + for (i = 0; i < idx; i++) { + oc_conf->ptload_addr[oc_conf->ptload_cnt] = + be64_to_cpu(opalc_metadata->region[i].dest); + oc_conf->ptload_size[oc_conf->ptload_cnt++] = + be64_to_cpu(opalc_metadata->region[i].size); + } + oc_conf->ptload_cnt = i; + oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir); + + if (!oc_conf->ptload_cnt) { + pr_err("OPAL memory regions not found\n"); + goto error_out; + } + + /* Parse OPAL CPU metadata */ + cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version); + if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU data version: %u, found: %u!\n", + HDAT_FADUMP_CPU_DATA_VER, cpu_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest); + if (!addr) { + pr_err("CPU state data not found!\n"); + goto error_out; + } + oc_conf->cpu_state_destination_vaddr = (u64)__va(addr); + + oc_conf->cpu_state_data_size = + be64_to_cpu(opalc_cpu_metadata->region[0].size); + oc_conf->cpu_state_entry_size = + be32_to_cpu(opalc_cpu_metadata->cpu_data_size); + + if ((oc_conf->cpu_state_entry_size == 0) || + (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid.\n"); + goto error_out; + } + oc_conf->num_cpus = (oc_conf->cpu_state_data_size / + oc_conf->cpu_state_entry_size); + + of_node_put(np); + return; + +error_out: + pr_err("Could not export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + of_node_put(np); +} + +static ssize_t fadump_release_opalcore_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/sys/firmware/opal/core' file not accessible!\n"); + return -EPERM; + } + + /* + * Take away '/sys/firmware/opal/core' and release all memory + * used for exporting this file. + */ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore, + 0200, NULL, + fadump_release_opalcore_store); + +static int __init opalcore_init(void) +{ + int rc = -1; + + opalcore_config_init(); + + if (oc_conf == NULL) + return rc; + + create_opalcore(); + + /* + * If oc_conf->opalcorebuf= is set in the 2nd kernel, + * then capture the dump. 
+ */ + if (!(is_opalcore_usable())) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + /* Set OPAL core file size */ + opal_core_attr.size = oc_conf->opalcore_size; + + /* Export OPAL core sysfs file */ + rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr); + if (rc != 0) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr); + if (rc) { + pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n", + rc); + } + + return 0; +} +fs_initcall(opalcore_init); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c new file mode 100644 index 000000000000..d361d37d975f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal fadump: " fmt + +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + + +#ifdef CONFIG_PRESERVE_FA_DUMP +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * ensure crash data is preserved in hope that the subsequent memory + * preserving kernel boot is going to process this crash data. + */ +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const struct opal_fadump_mem_struct *opal_fdm_active; + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + s64 ret; + + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) + return; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_debug("Could not get Kernel metadata (%lld)\n", ret); + return; + } + + /* + * Preserve memory only if kernel memory regions are registered + * with f/w for MPIPL. + */ + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + opal_fdm_active = (void *)addr; + if (opal_fdm_active->registered_regions == 0) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get boot memory tag (%lld)\n", ret); + return; + } + + /* + * Memory below this address can be used for booting a + * capture kernel or petitboot kernel. Preserve everything + * above this address for processing crashdump. 
+ */ + fadump_conf->boot_mem_top = be64_to_cpu(addr); + pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top); + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; +} + +#else /* CONFIG_PRESERVE_FA_DUMP */ +static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; +static struct opal_fadump_mem_struct *opal_fdm; + +#ifdef CONFIG_OPAL_CORE +extern bool kernel_initiated; +#endif + +static int opal_fadump_unregister(struct fw_dump *fadump_conf); + +static void opal_fadump_update_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + pr_debug("Boot memory regions count: %d\n", fdm->region_cnt); + + /* + * The destination address of the first boot memory region is the + * destination address of boot memory regions. + */ + fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest; + pr_debug("Destination address of boot memory regions: %#016llx\n", + fadump_conf->boot_mem_dest_addr); + + fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr; +} + +/* + * This function is called in the capture kernel to get configuration details + * from metadata setup by the first kernel. + */ +static void opal_fadump_get_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + unsigned long base, size, last_end, hole_size; + int i; + + if (!fadump_conf->dump_active) + return; + + last_end = 0; + hole_size = 0; + fadump_conf->boot_memory_size = 0; + + pr_debug("Boot memory regions:\n"); + for (i = 0; i < fdm->region_cnt; i++) { + base = fdm->rgn[i].src; + size = fdm->rgn[i].size; + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + + fadump_conf->boot_mem_addr[i] = base; + fadump_conf->boot_mem_sz[i] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + + last_end = base + size; + } + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest; + + /* + * Rarely, but it can so happen that system crashes before all + * boot memory regions are registered for MPIPL. In such + * cases, warn that the vmcore may not be accurate and proceed + * anyway as that is the best bet considering free pages, cache + * pages, user pages, etc are usually filtered out. + * + * Hope the memory that could not be preserved only has pages + * that are usually filtered out while saving the vmcore. + */ + if (fdm->region_cnt > fdm->registered_regions) { + pr_warn("Not all memory regions were saved!!!\n"); + pr_warn(" Unsaved memory regions:\n"); + i = fdm->registered_regions; + while (i < fdm->region_cnt) { + pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n", + i, fdm->rgn[i].src, fdm->rgn[i].size); + i++; + } + + pr_warn("If the unsaved regions only contain pages that are filtered out (eg. 
free/user pages), the vmcore should still be usable.\n"); + pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); + } + + fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size); + fadump_conf->boot_mem_regs_cnt = fdm->region_cnt; + opal_fadump_update_config(fadump_conf, fdm); +} + +/* Initialize kernel metadata */ +static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) +{ + fdm->version = OPAL_FADUMP_VERSION; + fdm->region_cnt = 0; + fdm->registered_regions = 0; + fdm->fadumphdr_addr = 0; +} + +static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + int i; + + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* Boot memory regions */ + for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i]; + opal_fdm->rgn[i].dest = addr; + opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i]; + + opal_fdm->region_cnt++; + addr += fadump_conf->boot_mem_sz[i]; + } + + /* + * Kernel metadata is passed to f/w and retrieved in capture kerenl. + * So, use it to save fadump header address instead of calculating it. + */ + opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest + + fadump_conf->boot_memory_size); + + opal_fadump_update_config(fadump_conf, opal_fdm); + + return addr; +} + +static u64 opal_fadump_get_metadata_size(void) +{ + return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct)); +} + +static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) +{ + int err = 0; + s64 ret; + + /* + * Use the last page(s) in FADump memory reservation for + * kernel metadata. + */ + fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start + + fadump_conf->reserve_dump_area_size - + opal_fadump_get_metadata_size()); + pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata); + + /* Initialize kernel metadata before registering the address with f/w */ + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* + * Register metadata address with f/w. Can be retrieved in + * the capture kernel. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, + fadump_conf->kernel_metadata); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set kernel metadata tag!\n"); + err = -EPERM; + } + + /* + * Register boot memory top address with f/w. Should be retrieved + * by a kernel that intends to preserve crash'ed kernel's memory. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, + fadump_conf->boot_mem_top); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set boot memory tag!\n"); + err = -EPERM; + } + + return err; +} + +static u64 opal_fadump_get_bootmem_min(void) +{ + return OPAL_FADUMP_MIN_BOOT_MEM; +} + +static int opal_fadump_register(struct fw_dump *fadump_conf) +{ + s64 rc = OPAL_PARAMETER; + int i, err = -EIO; + + for (i = 0; i < opal_fdm->region_cnt; i++) { + rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, + opal_fdm->rgn[i].src, + opal_fdm->rgn[i].dest, + opal_fdm->rgn[i].size); + if (rc != OPAL_SUCCESS) + break; + + opal_fdm->registered_regions++; + } + + switch (rc) { + case OPAL_SUCCESS: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_RESOURCE: + /* If MAX regions limit in f/w is hit, warn and proceed. 
*/ + pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n", + (opal_fdm->region_cnt - opal_fdm->registered_regions)); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_PARAMETER: + pr_err("Failed to register. Parameter Error(%lld).\n", rc); + break; + case OPAL_HARDWARE: + pr_err("Support not available.\n"); + fadump_conf->fadump_supported = 0; + fadump_conf->fadump_enabled = 0; + break; + default: + pr_err("Failed to register. Unknown Error(%lld).\n", rc); + break; + } + + /* + * If some regions were registered before OPAL_MPIPL_ADD_RANGE + * OPAL call failed, unregister all regions. + */ + if ((err < 0) && (opal_fdm->registered_regions > 0)) + opal_fadump_unregister(fadump_conf); + + return err; +} + +static int opal_fadump_unregister(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0); + if (rc) { + pr_err("Failed to un-register - unexpected Error(%lld).\n", rc); + return -EIO; + } + + opal_fdm->registered_regions = 0; + fadump_conf->dump_registered = 0; + return 0; +} + +static int opal_fadump_invalidate(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0); + if (rc) { + pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + opal_fdm_active = NULL; + return 0; +} + +static void opal_fadump_cleanup(struct fw_dump *fadump_conf) +{ + s64 ret; + + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0); + if (ret != OPAL_SUCCESS) + pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); +} + +/* + * Verify if CPU state data is available. If available, do a bit of sanity + * checking before processing this data. + */ +static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf) +{ + if (!opal_cpu_metadata) + return false; + + fadump_conf->cpu_state_data_version = + be32_to_cpu(opal_cpu_metadata->cpu_data_version); + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_dest_vaddr = + (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest)); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU state data version: %u, found: %d!\n", + HDAT_FADUMP_CPU_DATA_VER, + fadump_conf->cpu_state_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + if ((fadump_conf->cpu_state_dest_vaddr == 0) || + (fadump_conf->cpu_state_entry_size == 0) || + (fadump_conf->cpu_state_entry_size > + fadump_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid. Ignoring!\n"); + return false; + } + + return true; +} + +/* + * Convert CPU state data saved at the time of crash into ELF notes. + * + * While the crashing CPU's register data is saved by the kernel, CPU state + * data for all CPUs is saved by f/w. In CPU state data provided by f/w, + * each register entry is of 16 bytes, a numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. If this data is + * missing or in unsupported format, append crashing CPU's register data + * saved by the kernel in the PT_NOTE, to have something to work with in + * the vmcore file. 
+ */ +static int __init +opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, + struct fadump_crash_info_header *fdh) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + bool is_cpu_data_valid = false; + u32 num_cpus = 1, *note_buf; + struct pt_regs regs; + char *bufp; + int rc, i; + + if (is_opal_fadump_cpu_data_valid(fadump_conf)) { + size_per_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread); + bufp = __va(fadump_conf->cpu_state_dest_vaddr); + is_cpu_data_valid = true; + } + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + if (!is_cpu_data_valid) + goto out; + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu == thread_pir) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, true, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + +out: + /* + * CPU state data is invalid/unsupported. Try appending crashing CPU's + * register data, if it is saved by the kernel. 
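+ * note_buf still pointing at the start of the notes buffer means nothing
+ * was appended above.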
+ */ + if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) { + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) { + fadump_free_cpu_notes_buf(); + return -ENODEV; + } + + pr_warn("WARNING: appending only crashing CPU's register data\n"); + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + } + + final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + return 0; +} + +static int __init opal_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = -EINVAL; + + if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + return rc; + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return rc; + } + +#ifdef CONFIG_OPAL_CORE + /* + * If this is a kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN) + kernel_initiated = true; +#endif + + rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return rc; +} + +static void opal_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct opal_fadump_mem_struct *fdm_ptr; + u64 dumped_bytes = 0; + int i; + + if (fadump_conf->dump_active) + fdm_ptr = opal_fdm_active; + else + fdm_ptr = opal_fdm; + + for (i = 0; i < fdm_ptr->region_cnt; i++) { + /* + * Only regions that are registered for MPIPL + * would have dump data. + */ + if ((fadump_conf->dump_active) && + (i < fdm_ptr->registered_regions)) + dumped_bytes = fdm_ptr->rgn[i].size; + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + fdm_ptr->rgn[i].size, dumped_bytes); + } + + /* Dump is active. Show reserved area start address. */ + if (fadump_conf->dump_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + int rc; + + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. 
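+ * The capture kernel matches this PIR against the per-thread 'pir' field
+ * in the firmware-provided CPU state data (see opal_fadump_build_cpu_notes()).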
+ */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); + if (rc == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported.\n", + OPAL_REBOOT_MPIPL); + } else if (rc == OPAL_HARDWARE) + pr_emerg("No backend support for MPIPL!\n"); +} + +static struct fadump_ops opal_fadump_ops = { + .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_get_metadata_size = opal_fadump_get_metadata_size, + .fadump_setup_metadata = opal_fadump_setup_metadata, + .fadump_get_bootmem_min = opal_fadump_get_bootmem_min, + .fadump_register = opal_fadump_register, + .fadump_unregister = opal_fadump_unregister, + .fadump_invalidate = opal_fadump_invalidate, + .fadump_cleanup = opal_fadump_cleanup, + .fadump_process = opal_fadump_process, + .fadump_region_show = opal_fadump_region_show, + .fadump_trigger = opal_fadump_trigger, +}; + +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + int i, len; + s64 ret; + + /* + * Check if Firmware-Assisted Dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) { + pr_debug("FADump support is missing!\n"); + return; + } + + if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) { + pr_err("Support missing for this f/w version!\n"); + return; + } + + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* + * Each f/w load area is an (address,size) pair, + * 2 cells each, totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_FADUMP_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return; + } + } + } + + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get Kernel metadata (%lld)\n", ret); + return; + } + + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + + opal_fdm_active = __va(addr); + if (opal_fdm_active->version != OPAL_FADUMP_VERSION) { + pr_warn("Supported kernel metadata version: %u, found: %d!\n", + OPAL_FADUMP_VERSION, opal_fdm_active->version); + pr_warn("WARNING: Kernel metadata format mismatch identified! 
Core file maybe corrupted..\n"); + } + + /* Kernel regions not registered with f/w for MPIPL */ + if (opal_fdm_active->registered_regions == 0) { + opal_fdm_active = NULL; + return; + } + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if (addr) { + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opal_cpu_metadata = __va(addr); + } + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + opal_fadump_get_config(fadump_conf, opal_fdm_active); +} +#endif /* !CONFIG_PRESERVE_FA_DUMP */ diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h new file mode 100644 index 000000000000..f1e9ecf548c5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _POWERNV_OPAL_FADUMP_H +#define _POWERNV_OPAL_FADUMP_H + +#include <asm/reg.h> + +/* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL) + +/* + * OPAL FADump metadata structure format version + * + * OPAL FADump kernel metadata structure stores kernel metadata needed to + * register-for/process crash dump. Format version is used to keep a tab on + * the changes in the structure format. The changes, if any, to the format + * are expected to be minimal and backward compatible. + */ +#define OPAL_FADUMP_VERSION 0x1 + +/* + * OPAL FADump kernel metadata + * + * The address of this structure will be registered with f/w for retrieving + * and processing during crash dump. + */ +struct opal_fadump_mem_struct { + u8 version; + u8 reserved[3]; + u16 region_cnt; /* number of regions */ + u16 registered_regions; /* Regions registered for MPIPL */ + u64 fadumphdr_addr; + struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS]; +} __packed; + +/* + * CPU state data + * + * CPU state data information is provided by f/w. The format for this data + * is defined in the HDAT spec. Version is used to keep a tab on the changes + * in this CPU state data format. Changes to this format are unlikely, but + * if there are any changes, please refer to latest HDAT specification. + */ +#define HDAT_FADUMP_CPU_DATA_VER 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. 
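+ * Each entry is 16 bytes: register type and number in the first 8 bytes,
+ * the register value in the remaining 8.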
*/ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __packed; + +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + bool cpu_endian, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + u64 val; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : + reg_entry->reg_val); + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + val); + } +} + +#endif /* _POWERNV_OPAL_FADUMP_H */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..e04b20625cb9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node, struct imc_pmu *pmu_ptr) { static u64 loc, *imc_mode_addr, *imc_cmd_addr; - int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + struct imc_mem_info *ptr = pmu_ptr->mem_info; imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); @@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node, if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { + loc = (u64)(ptr->vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); - sprintf(mode, "imc_mode_%d", nid); + sprintf(mode, "imc_mode_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, imc_mode_addr)) goto err; imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); - sprintf(cmd, "imc_cmd_%d", nid); + sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, imc_cmd_addr)) goto err; - chip++; + ptr++; } return; diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index dc51d03c6370..d26da19a611f 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -29,23 +29,23 @@ struct memcons { static struct memcons *opal_memcons = NULL; -ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count) { const char *conbuf; ssize_t ret; size_t first_read = 0; uint32_t out_pos, avail; - if (!opal_memcons) + if (!mc) return -ENODEV; - out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(mc->out_pos)); /* Now we've read out_pos, put a barrier in before 
reading the new * data it points to in conbuf. */ smp_rmb(); - conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys)); + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); /* When the buffer has wrapped, read from the out_pos marker to the end * of the buffer, and then read the remaining data as in the un-wrapped @@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (out_pos & MEMCONS_OUT_POS_WRAP) { out_pos &= MEMCONS_OUT_POS_MASK; - avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos; + avail = be32_to_cpu(mc->obuf_size) - out_pos; ret = memory_read_from_buffer(to, count, &pos, conbuf + out_pos, avail); @@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) } /* Sanity check. The firmware should not do this to us. */ - if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) { + if (out_pos > be32_to_cpu(mc->obuf_size)) { pr_err("OPAL: memory console corruption. Aborting read.\n"); return -EINVAL; } @@ -86,6 +86,11 @@ out: return ret; } +ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +{ + return memcons_copy(opal_memcons, to, pos, count); +} + static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *to, loff_t pos, size_t count) @@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = { .read = opal_msglog_read }; -void __init opal_msglog_init(void) +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name) { u64 mcaddr; struct memcons *mc; - if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { - pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); - return; + if (of_property_read_u64(node, mc_prop_name, &mcaddr)) { + pr_warn("%s property not found, no message log\n", + mc_prop_name); + goto out_err; } mc = phys_to_virt(mcaddr); if (!mc) { - pr_warn("OPAL: memory console address is invalid\n"); - return; + pr_warn("memory console address is invalid\n"); + goto out_err; } if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { - pr_warn("OPAL: memory console version is invalid\n"); - return; + pr_warn("memory console version is invalid\n"); + goto out_err; } - /* Report maximum size */ - opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + - be32_to_cpu(mc->obuf_size); + return mc; + +out_err: + return NULL; +} + +u32 memcons_get_size(struct memcons *mc) +{ + return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size); +} + +void __init opal_msglog_init(void) +{ + opal_memcons = memcons_init(opal_node, "ibm,opal-memcons"); + if (!opal_memcons) { + pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n"); + return; + } - opal_memcons = mc; + opal_msglog_attr.size = memcons_get_size(opal_memcons); } void __init opal_msglog_sysfs_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index e072bf157d62..45f4223a790f 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, int msg_size, item_size; unsigned long flags; - if (msg_type != OPAL_MSG_PRD) + if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2) return 0; /* Calculate total size of the message and item we need to store. 
The @@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev) return rc; } + rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb); + if (rc) { + pr_err("Couldn't register PRD2 event notifier\n"); + return rc; + } + rc = misc_register(&opal_prd_dev); if (rc) { pr_err("failed to register miscdev\n"); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 66430eebe869..fd510d961b8c 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * PowerNV LPC bus handling. + * PowerNV SCOM bus debugfs interface * + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * <benh@kernel.crashing.org> + * and David Gibson, IBM Corporation. * Copyright 2013 IBM Corp. */ @@ -10,62 +13,13 @@ #include <linux/bug.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> -#include <asm/scom.h> - -/* - * We could probably fit that inside the scom_map_t - * which is a void* after all but it's really too ugly - * so let's kmalloc it for now - */ -struct opal_scom_map { - uint32_t chip; - uint64_t addr; -}; - -static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) -{ - struct opal_scom_map *m; - const __be32 *gcid; - - if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %pOF is not a SCOM controller\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - gcid = of_get_property(dev, "ibm,chip-id", NULL); - if (!gcid) { - pr_err("%s: device %pOF has no ibm,chip-id\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - m = kmalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return NULL; - m->chip = be32_to_cpup(gcid); - m->addr = reg; - - return (scom_map_t)m; -} - -static void opal_scom_unmap(scom_map_t map) -{ - kfree(map); -} - -static int opal_xscom_err_xlate(int64_t rc) -{ - switch(rc) { - case 0: - return 0; - /* Add more translations if necessary */ - default: - return -EIO; - } -} +#include <asm/debugfs.h> +#include <asm/prom.h> static u64 opal_scom_unmangle(u64 addr) { @@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr) return addr; } -static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value) { - struct opal_scom_map *m = map; int64_t rc; __be64 v; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v)); + if (rc) { + *value = 0xfffffffffffffffful; + return -EIO; + } *value = be64_to_cpu(v); - return opal_xscom_err_xlate(rc); + return 0; } -static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value) { - struct opal_scom_map *m = map; int64_t rc; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_write(m->chip, reg, value); - return opal_xscom_err_xlate(rc); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_write(chip, reg, value); + if (rc) + return -EIO; + return 0; +} + +struct scom_debug_entry { + u32 chip; + struct debugfs_blob_wrapper path; + char name[16]; +}; + +static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user 
*)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = opal_scom_read(ent->chip, reg_base, reg, &val); + if (!rc) + rc = put_user(val, ubuf64); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + *ppos += 8; + done += 8; + } + return done; +} + +static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = get_user(val, ubuf64); + if (!rc) + rc = opal_scom_write(ent->chip, reg_base, reg, val); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + done += 8; + } + return done; } -static const struct scom_controller opal_scom_controller = { - .map = opal_scom_map, - .unmap = opal_scom_unmap, - .read = opal_scom_read, - .write = opal_scom_write +static const struct file_operations scom_debug_fops = { + .read = scom_debug_read, + .write = scom_debug_write, + .open = simple_open, + .llseek = default_llseek, }; -static int opal_xscom_init(void) +static int scom_debug_init_one(struct dentry *root, struct device_node *dn, + int chip) { - if (firmware_has_feature(FW_FEATURE_OPAL)) - scom_init(&opal_scom_controller); + struct scom_debug_entry *ent; + struct dentry *dir; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->chip = chip; + snprintf(ent->name, 16, "%08x", chip); + ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.size = strlen((char *)ent->path.data); + + dir = debugfs_create_dir(ent->name, root); + if (!dir) { + kfree(ent->path.data); + kfree(ent); + return -1; + } + + debugfs_create_blob("devspec", 0400, dir, &ent->path); + debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); + return 0; } -machine_arch_initcall(powernv, opal_xscom_init); + +static int scom_debug_init(void) +{ + struct device_node *dn; + struct dentry *root; + int chip, rc; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return 0; + + root = debugfs_create_dir("scom", powerpc_debugfs_root); + if (!root) + return -1; + + rc = 0; + for_each_node_with_property(dn, "scom-controller") { + chip = of_get_ibm_chip_id(dn); + WARN_ON(chip == -1); + rc |= scom_debug_init_one(root, dn, chip); + } + + return rc; +} +device_initcall(scom_debug_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aba443be7daa..38e90270280b 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static uint32_t opal_heartbeat; static struct task_struct *kopald_tsk; +static struct opal_msg *opal_msg; +static u32 opal_msg_size __ro_after_init; void opal_configure_cores(void) { @@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg) static void opal_handle_message(void) { s64 ret; - /* - * TODO: pre-allocate a message buffer depending on opal-msg-size - * value in /proc/device-tree. 
- */ - static struct opal_msg msg; u32 type; - ret = opal_get_msg(__pa(&msg), sizeof(msg)); + ret = opal_get_msg(__pa(opal_msg), opal_msg_size); /* No opal message pending. */ if (ret == OPAL_RESOURCE) return; @@ -290,14 +287,14 @@ static void opal_handle_message(void) return; } - type = be32_to_cpu(msg.msg_type); + type = be32_to_cpu(opal_msg->msg_type); /* Sanity check */ if (type >= OPAL_MSG_TYPE_MAX) { pr_warn_once("%s: Unknown message type: %u\n", __func__, type); return; } - opal_message_do_notify(type, (void *)&msg); + opal_message_do_notify(type, (void *)opal_msg); } static irqreturn_t opal_message_notify(int irq, void *data) @@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data) return IRQ_HANDLED; } -static int __init opal_message_init(void) +static int __init opal_message_init(struct device_node *opal_node) { int ret, i, irq; + ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size); + if (ret) { + pr_notice("Failed to read opal-msg-size property\n"); + opal_msg_size = sizeof(struct opal_msg); + } + + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + if (!opal_msg) { + opal_msg_size = sizeof(struct opal_msg); + /* Try to allocate fixed message size */ + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + BUG_ON(opal_msg == NULL); + } + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); @@ -705,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, bin_attr->size); } -static BIN_ATTR_RO(symbol_map, 0); +static struct bin_attribute symbol_map_attr = { + .attr = {.name = "symbol_map", .mode = 0400}, + .read = symbol_map_read +}; static void opal_export_symmap(void) { @@ -722,10 +736,10 @@ static void opal_export_symmap(void) return; /* Setup attributes */ - bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); - bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + symbol_map_attr.private = __va(be64_to_cpu(syms[0])); + symbol_map_attr.size = be64_to_cpu(syms[1]); - rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); + rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr); if (rc) pr_warn("Error %d creating OPAL symbols file\n", rc); } @@ -910,7 +924,7 @@ static int __init opal_init(void) } /* Initialise OPAL messaging system */ - opal_message_init(); + opal_message_init(opal_node); /* Initialise OPAL asynchronous completion interface */ opal_async_comp_init(); diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index e28f03e1eb5e..a0b9c0c23ed2 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) struct page *tce_mem = NULL; __be64 *addr; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN, + shift - PAGE_SHIFT); if (!tce_mem) { pr_err("Failed to allocate a TCE memory, level shift=%d\n", shift); @@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels); + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; @@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce; + unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n])); - if (tmp[n] == 0) { + if (!tce) { __be64 *tmp2; if (!alloc) @@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) if (!tmp2) return NULL; - tmp[n] = cpu_to_be64(__pa(tmp2) | - TCE_PCI_READ | TCE_PCI_WRITE); + tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE; + oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0, + cpu_to_be64(tce))); + if (oldtce) { + pnv_pci_ioda2_table_do_free_pages(tmp2, + ilog2(tbl->it_level_size) + 3, 1); + tce = oldtce; + } } - tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; @@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) if (ptce) *ptce = cpu_to_be64(0); + else + /* Skip the rest of the level */ + i |= tbl->it_level_size - 1; } } @@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; - unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; - if (alloc_userspace_copy && (window_size > (1ULL << 32))) - tmplevels = 1; - /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, &total_allocated); + 1, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. 
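* Only a single-level table has to be fully backed up front; with multiple
* levels, missing entries are populated on demand in pnv_tce().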
*/ - if (tmplevels == levels && offset < tce_table_size) + if (levels == 1 && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, + 1, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (tmplevels == levels && (offset < tce_table_size || + if (levels == 1 && (offset < tce_table_size || total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, tmplevels, levels); + tbl->it_userspace, 1, levels); return 0; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8080558d020..c28d0d9b7ee0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1939,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, } #ifdef CONFIG_IOMMU_API -static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) +/* Common for IODA1 and IODA2 */ +static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction, + bool realmode) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); - - return ret; + return pnv_tce_xchg(tbl, index, hpa, direction, !realmode); } #endif @@ -1973,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda1_tce_xchg, - .exchange_rm = pnv_ioda1_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_p7ioc_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, @@ -2103,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, return ret; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -2138,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda2_tce_xchg, - .exchange_rm = pnv_ioda2_tce_xchg_rm, + .xchg_no_kill = 
pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_ioda2_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, @@ -2303,7 +2265,7 @@ found: tbl->it_ops = &pnv_ioda1_iommu_ops; pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node); + iommu_init_table(tbl, phb->hose->node, 0, 0); if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -2420,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = NULL; long rc; + unsigned long res_start, res_end; /* * crashkernel= specifies the kdump kernel's maximum memory at @@ -2433,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1); + + /* + * We create the default window as big as we can. The constraint is + * the max order of allocation possible. The TCE table is likely to + * end up being multilevel and with on-demand allocation in place, + * the initial use is not going to be huge as the default window aims + * to support crippled devices (i.e. not fully 64bit DMAble) only. + */ + /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ + const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ + unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT); + unsigned long tcelevel_order = ilog2(maxblock >> 3); + unsigned int levels = tces_order / tcelevel_order; + + if (tces_order % tcelevel_order) + levels += 1; + /* + * We try to stick to default levels (which is >1 at the moment) in + * order to save memory by relying on on-demain TCE level allocation. 
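+ * For illustration only, assuming 64K pages (PAGE_SHIFT = 16) and
+ * MAX_ORDER = 9: maxblock = 1 << 24 = 16MB, window_size is at most
+ * (16MB * 8) << 16 = 8TB, tces_order = ilog2(8TB >> 16) = 27,
+ * tcelevel_order = ilog2(16MB >> 3) = 21, hence levels =
+ * DIV_ROUND_UP(27, 21) = 2, which already matches
+ * POWERNV_IOMMU_DEFAULT_LEVELS.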
+ */ + levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS); - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT, + window_size, levels, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); return rc; } - iommu_init_table(tbl, pe->phb->hose->node); + /* We use top part of 32bit space for MMIO so exclude it from DMA */ + res_start = 0; + res_end = 0; + if (window_size > pe->phb->ioda.m32_pci_base) { + res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; + res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; + } + iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end); rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); if (rc) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6104418c9ad5..2825d004dece 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) break; } - if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { + if (!of_device_is_compatible(parent, "ibm,ioda2-phb") && + !of_device_is_compatible(parent, "ibm,ioda3-phb")) { of_node_put(parent); continue; } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 469c24463247..f914f0b14e4e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach( struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_DEFAULT_LEVELS 2 #define POWERNV_IOMMU_MAX_LEVELS 5 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index fd4a1c5a6369..1aa51c4fa904 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -30,4 +30,9 @@ extern void opal_event_shutdown(void); bool cpu_core_split_required(void); +struct memcons; +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count); +u32 memcons_get_size(struct memcons *mc); +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a5e52f9eed3c..83498604d322 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -24,6 +24,7 @@ #include <linux/bug.h> #include <linux/pci.h> #include <linux/cpufreq.h> +#include <linux/memblock.h> #include <asm/machdep.h> #include <asm/firmware.h> @@ -166,6 +167,14 @@ static void __init pnv_init(void) else #endif add_preferred_console("hvc", 0, NULL); + + if (!radix_enabled()) { + int i; + + /* Allocate per cpu area to save old slb contents during MCE */ + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i)); + } } static void __init pnv_init_IRQ(void) diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c new file mode 100644 index 000000000000..e4a00ad06f9d --- /dev/null +++ b/arch/powerpc/platforms/powernv/ultravisor.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* + * Ultravisor high level interfaces + * + * Copyright 2019, IBM Corporation. + * + */ +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/of_fdt.h> +#include <linux/of.h> + +#include <asm/ultravisor.h> +#include <asm/firmware.h> +#include <asm/machdep.h> + +#include "powernv.h" + +static struct kobject *ultravisor_kobj; + +int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, + int depth, void *data) +{ + if (!of_flat_dt_is_compatible(node, "ibm,ultravisor")) + return 0; + + powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR; + pr_debug("Ultravisor detected!\n"); + return 1; +} + +static struct memcons *uv_memcons; + +static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + return memcons_copy(uv_memcons, to, pos, count); +} + +static struct bin_attribute uv_msglog_attr = { + .attr = {.name = "msglog", .mode = 0400}, + .read = uv_msglog_read +}; + +static int __init uv_init(void) +{ + struct device_node *node; + + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + return 0; + + node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); + if (!node) + return -ENODEV; + + uv_memcons = memcons_init(node, "memcons"); + if (!uv_memcons) + return -ENOENT; + + uv_msglog_attr.size = memcons_get_size(uv_memcons); + + ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj); + if (!ultravisor_kobj) + return -ENOMEM; + + return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr); +} +machine_subsys_initcall(powernv, uv_init); diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index bdaeaecdc06b..1193c294b8d0 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu) * setup_areas - Map the spu regions into the address space. * * The current HV requires the spu shadow regs to be mapped with the - * PTE page protection bits set as read-only (PP=3). This implementation - * uses the low level __ioremap() to bypass the page protection settings - * inforced by ioremap_prot() to get the needed PTE bits set for the - * shadow regs. + * PTE page protection bits set as read-only. */ static int __init setup_areas(struct spu *spu) @@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu) struct table {char* name; unsigned long addr; unsigned long size;}; unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); - spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, - sizeof(struct spe_shadow), - shadow_flags); + spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr, + sizeof(struct spe_shadow), shadow_flags); if (!spu_pdata(spu)->shadow) { pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__); goto fail_ioremap; diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7b484f55553..9e35cddddf73 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -145,3 +145,17 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. 
+ +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + help + There are certain POWER platforms which support secure guests using + the Protected Execution Facility, with the help of an Ultravisor + executing below the hypervisor layer. This enables support for + those guests. + + If unsure, say "N". diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index ab3d59aeacca..a3c74a5cf20d 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_PPC_SVM) += svm.o +obj-$(CONFIG_FA_DUMP) += rtas-fadump.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 9edae1863e2f..893ba3f562c4 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -42,42 +42,44 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; -#ifdef CONFIG_PCI_IOV void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - struct pci_dn *physfn_pdn; - struct eeh_dev *edev; - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - pdn->device_id = pdev->device; - pdn->vendor_id = pdev->vendor; - pdn->class_code = pdev->class; - /* - * Last allow unfreeze return code used for retrieval - * by user space in eeh-sysfs to show the last command - * completion from platform. - */ - pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; - edev = pdn_to_eeh_dev(pdn); + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct pci_dn *physfn_pdn; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + pdn->device_id = pdev->device; + pdn->vendor_id = pdev->vendor; + pdn->class_code = pdev->class; + /* + * Last allow unfreeze return code used for retrieval + * by user space in eeh-sysfs to show the last command + * completion from platform. + */ + pdn->last_allow_rc = 0; + physfn_pdn = pci_get_pdn(pdev->physfn); + pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; + } +#endif eeh_add_device_early(pdn); eeh_add_device_late(pdev); - edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ - eeh_sysfs_add_device(pdev); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); -} + edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ + eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + } #endif + eeh_sysfs_add_device(pdev); +} /* * Buffer for reporting slot-error-detail rtas calls. 
Its here @@ -144,10 +146,8 @@ static int pseries_eeh_init(void) /* Set EEH probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); -#ifdef CONFIG_PCI_IOV /* Set EEH machine dependent code */ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; -#endif return 0; } @@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* * Update class code and mode of eeh device. We need * correctly reflects that current device is root port @@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); /* Enable EEH on the device */ + eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); - if (!ret) { + if (ret) { + eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); + } else { /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; @@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if (enable) { eeh_add_flag(EEH_ENABLED); eeh_add_to_parent_pe(edev); - - pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr); } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && (pdn_to_eeh_dev(pdn->parent))->pe) { /* This device doesn't support EEH, but it may have an @@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } + eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", + (enable ? 
"enabled" : "unsupported"), ret); } /* Save memory bars */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 46d0d35b9ca4..8e700390f3d6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) switch (hp_elog->action) { case PSERIES_HP_ELOG_ACTION_ADD: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_add_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_add_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_add_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; case PSERIES_HP_ELOG_ACTION_REMOVE: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_remove_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_remove_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_remove_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 889dc2e44b89..6ba081dd61c9 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,6 +36,7 @@ #include <asm/udbg.h> #include <asm/mmzone.h> #include <asm/plpar_wrappers.h> +#include <asm/svm.h> #include "pseries.h" @@ -609,7 +610,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -621,7 +622,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction) + long *tce, enum dma_data_direction *direction, + bool realmode) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; @@ -649,7 +651,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, #ifdef CONFIG_IOMMU_API - .exchange = tce_exchange_pseries, + .xchg_no_kill = tce_exchange_pseries, #endif .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP @@ -690,7 +692,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node); + iommu_init_table(tbl, 
ppci->phb->node, 0, 0); iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -719,7 +721,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node); + iommu_init_table(tbl, phb->node, 0, 0); set_iommu_table_base(&dev->dev, tbl); return; } @@ -1169,7 +1171,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->table_group); @@ -1318,7 +1320,15 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - set_pci_dma_ops(&dma_iommu_ops); + /* + * Secure guest memory is inacessible to devices so regular DMA isn't + * possible. + * + * In that case keep devices' dma_map_ops as NULL so that the generic + * DMA code path will use SWIOTLB to bounce buffers for DMA. + */ + if (!is_secure_guest()) + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 09bb878c21e0..36b846f6e74e 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data) return 0; } -/* Must be called in user context */ +/* + * Must be called in process context. The caller must hold the + * cpus_lock. + */ static int pseries_lpar_resize_hpt(unsigned long shift) { struct hpt_resize_state state = { @@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift) t1 = ktime_get(); - rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); t2 = ktime_get(); @@ -1527,16 +1531,24 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; - register_process_table = pseries_lpar_register_process_table; if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. 
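+ * A zero base and size is passed for the hash case; contrast with
+ * radix_init_pseries() below, which registers the actual process table
+ * address and size.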
+ */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pseries_lpar_register_process_table(0, 0, 0); } void radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); - register_process_table = pseries_lpar_register_process_table; + + pseries_lpar_register_process_table(__pa(process_tb), + 0, PRTB_SIZE_SHIFT - 12); } #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index fe812bebdf5e..b571285f6c14 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -9,6 +9,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/kobject.h> +#include <linux/sched.h> #include <linux/smp.h> #include <linux/stat.h> #include <linux/completion.h> @@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope) prop_data += vd; } + + cond_resched(); } + + cond_resched(); } while (rtas_rc == 1); of_node_put(dn); @@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope) add_dt_node(phandle, drc_index); break; } + + cond_resched(); } } + + cond_resched(); } while (rc == 1); kfree(rtas_buf); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1eae1d09980c..722830978639 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void) pSeries_request_regions(); - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); #ifdef CONFIG_PCI_IOV ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f16fdd0f71f7..3acdcc3bb908 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -76,6 +76,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_UE 0x00 #define MC_ERROR_TYPE_SLB 0x01 #define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_UNKNOWN 0x03 #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 @@ -87,6 +88,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_UE_LOAD_STORE 3 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + #define MC_ERROR_SLB_PARITY 0 #define MC_ERROR_SLB_MULTIHIT 1 #define MC_ERROR_SLB_INDETERMINATE 2 @@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) } } -static -inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) -{ - __be64 addr = 0; - - switch (mlog->error_type) { - case MC_ERROR_TYPE_UE: - if (mlog->sub_err_type & 0x40) - addr = mlog->effective_address; - break; - case MC_ERROR_TYPE_SLB: - case MC_ERROR_TYPE_ERAT: - case MC_ERROR_TYPE_TLB: - if (mlog->sub_err_type & 0x80) - addr = mlog->effective_address; - default: - break; - } - return be64_to_cpu(addr); -} - /* * Enable the hotplug interrupt late because processing them may touch other * devices or systems (e.g. hugepages) that have not been initialized at the @@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -#define VAL_TO_STRING(ar, val) \ - (((val) < ARRAY_SIZE(ar)) ? 
ar[(val)] : "Unknown") -static void pseries_print_mce_info(struct pt_regs *regs, - struct rtas_error_log *errp) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { - const char *level, *sevstr; + struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; - u8 error_type, err_sub_type; - u64 addr; - u8 initiator = rtas_error_initiator(errp); int disposition = rtas_error_disposition(errp); + int initiator = rtas_error_initiator(errp); + int severity = rtas_error_severity(errp); + u8 error_type, err_sub_type; - static const char * const initiators[] = { - [0] = "Unknown", - [1] = "CPU", - [2] = "PCI", - [3] = "ISA", - [4] = "Memory", - [5] = "Power Mgmt", - }; - static const char * const mc_err_types[] = { - [0] = "UE", - [1] = "SLB", - [2] = "ERAT", - [3] = "Unknown", - [4] = "TLB", - [5] = "D-Cache", - [6] = "Unknown", - [7] = "I-Cache", - }; - static const char * const mc_ue_types[] = { - [0] = "Indeterminate", - [1] = "Instruction fetch", - [2] = "Page table walk ifetch", - [3] = "Load/Store", - [4] = "Page table walk Load/Store", - }; - - /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ - static const char * const mc_slb_types[] = { - [0] = "Parity", - [1] = "Multihit", - [2] = "Indeterminate", - }; - - /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ - static const char * const mc_soft_types[] = { - [0] = "Unknown", - [1] = "Parity", - [2] = "Multihit", - [3] = "Indeterminate", - }; - - if (!rtas_error_extended(errp)) { - pr_err("Machine check interrupt: Missing extended error log\n"); - return; - } + if (initiator == RTAS_INITIATOR_UNKNOWN) + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + else if (initiator == RTAS_INITIATOR_CPU) + mce_err.initiator = MCE_INITIATOR_CPU; + else if (initiator == RTAS_INITIATOR_PCI) + mce_err.initiator = MCE_INITIATOR_PCI; + else if (initiator == RTAS_INITIATOR_ISA) + mce_err.initiator = MCE_INITIATOR_ISA; + else if (initiator == RTAS_INITIATOR_MEMORY) + mce_err.initiator = MCE_INITIATOR_MEMORY; + else if (initiator == RTAS_INITIATOR_POWERMGM) + mce_err.initiator = MCE_INITIATOR_POWERMGM; + else + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + + if (severity == RTAS_SEVERITY_NO_ERROR) + mce_err.severity = MCE_SEV_NO_ERROR; + else if (severity == RTAS_SEVERITY_EVENT) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_WARNING) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_ERROR_SYNC) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_ERROR) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; + else + mce_err.severity = MCE_SEV_FATAL; + + if (severity <= RTAS_SEVERITY_ERROR_SYNC) + mce_err.sync_error = true; + else + mce_err.sync_error = false; + + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err.error_class = MCE_ECLASS_UNKNOWN; + + if (!rtas_error_extended(errp)) + goto out; pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) - return; + goto out; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; err_sub_type = rtas_mc_error_sub_type(mce_log); - switch (rtas_error_severity(errp)) { - case RTAS_SEVERITY_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case RTAS_SEVERITY_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case RTAS_SEVERITY_ERROR: - case 
RTAS_SEVERITY_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case RTAS_SEVERITY_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + switch (mce_log->error_type) { + case MC_ERROR_TYPE_UE: + mce_err.error_type = MCE_ERROR_TYPE_UE; + switch (err_sub_type) { + case MC_ERROR_UE_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case MC_ERROR_UE_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + break; + case MC_ERROR_UE_INDETERMINATE: + default: + mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); -#ifdef CONFIG_PPC_BOOK3S_64 - /* Display faulty slb contents for SLB errors. */ - if (error_type == MC_ERROR_TYPE_SLB) - slb_dump_contents(local_paca->mce_faulty_slbs); -#endif + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - disposition == RTAS_DISP_FULLY_RECOVERED ? - "Recovered" : "Not recovered"); - if (user_mode(regs)) { - printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, - regs->nip, current->pid, current->comm); - } else { - printk("%s NIP [%016lx]: %pS\n", level, regs->nip, - (void *)regs->nip); - } - printk("%s Initiator: %s\n", level, - VAL_TO_STRING(initiators, initiator)); + pfn = addr_to_pfn(regs, eaddr); + if (pfn != ULONG_MAX) + paddr = pfn << PAGE_SHIFT; + } - switch (error_type) { - case MC_ERROR_TYPE_UE: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_ue_types, err_sub_type)); break; case MC_ERROR_TYPE_SLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_slb_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_SLB; + switch (err_sub_type) { + case MC_ERROR_SLB_PARITY: + mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case MC_ERROR_SLB_MULTIHIT: + mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case MC_ERROR_SLB_INDETERMINATE: + default: + mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: + mce_err.error_type = MCE_ERROR_TYPE_ERAT; + switch (err_sub_type) { + case MC_ERROR_ERAT_PARITY: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; + break; + case MC_ERROR_ERAT_MULTIHIT: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + break; + case MC_ERROR_ERAT_INDETERMINATE: + default: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; case MC_ERROR_TYPE_TLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_soft_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_TLB; + switch (err_sub_type) { + case MC_ERROR_TLB_PARITY: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; + break; + case MC_ERROR_TLB_MULTIHIT: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case 
MC_ERROR_TLB_INDETERMINATE: + default: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; + case MC_ERROR_TYPE_D_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; + case MC_ERROR_TYPE_I_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + break; + case MC_ERROR_TYPE_UNKNOWN: default: - printk("%s Error type: %s\n", level, - VAL_TO_STRING(mc_err_types, error_type)); + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; break; } - addr = rtas_mc_get_effective_addr(mce_log); - if (addr) - printk("%s Effective address: %016llx\n", level, addr); -} - -static int mce_handle_error(struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - int disposition = rtas_error_disposition(errp); - u8 error_type; - - if (!rtas_error_extended(errp)) - goto out; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (pseries_log == NULL) - goto out; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; - #ifdef CONFIG_PPC_BOOK3S_64 if (disposition == RTAS_DISP_NOT_RECOVERED) { switch (error_type) { @@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; - rtas_set_disposition_recovered(errp); break; default: break; } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; } #endif out: - return disposition; -} - -#ifdef CONFIG_MEMORY_FAILURE - -static DEFINE_PER_CPU(int, rtas_ue_count); -static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, + &mce_err, regs->nip, eaddr, paddr); -#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 -#define UE_LOGICAL_ADDR_PROVIDED 0x20 - - -static void pseries_hwpoison_work_fn(struct work_struct *work) -{ - unsigned long paddr; - int index; - - while (__this_cpu_read(rtas_ue_count) > 0) { - index = __this_cpu_read(rtas_ue_count) - 1; - paddr = __this_cpu_read(rtas_ue_paddr[index]); - memory_failure(paddr >> PAGE_SHIFT, 0); - __this_cpu_dec(rtas_ue_count); - } -} - -static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); - -static void queue_ue_paddr(unsigned long paddr) -{ - int index; - - index = __this_cpu_inc_return(rtas_ue_count) - 1; - if (index >= MAX_MC_EVT) { - __this_cpu_dec(rtas_ue_count); - return; - } - this_cpu_write(rtas_ue_paddr[index], paddr); - schedule_work(&hwpoison_work); -} - -static void pseries_do_memory_failure(struct pt_regs *regs, - struct pseries_mc_errorlog *mce_log) -{ - unsigned long paddr; - - if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { - paddr = be64_to_cpu(mce_log->logical_address); - } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { - unsigned long pfn; - - pfn = addr_to_pfn(regs, - be64_to_cpu(mce_log->effective_address)); - if (pfn == ULONG_MAX) - return; - paddr = pfn << PAGE_SHIFT; - } else { - return; - } - queue_ue_paddr(paddr); -} - -static void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - - if (!rtas_error_extended(errp)) - return; - - pseries_log = get_pseries_errorlog(errp, 
PSERIES_ELOG_SECT_ID_MCE); - if (!pseries_log) - return; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - - if (mce_log->error_type == MC_ERROR_TYPE_UE) - pseries_do_memory_failure(regs, mce_log); + return disposition; } -#else -static inline void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) { } -#endif /*CONFIG_MEMORY_FAILURE */ /* * Process MCE rtas errlog event. @@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work) * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ -static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - int disposition = rtas_error_disposition(err); - - pseries_print_mce_info(regs, err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; - - } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ recovered = 1; + } else if (evt->severity == MCE_SEV_FATAL) { + /* Fatal machine check */ + pr_err("Machine check interrupt is fatal\n"); + recovered = 0; + } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - printk(KERN_ERR "MCE: limited recovery, system may " - "be degraded\n"); - recovered = 1; - - } else if (user_mode(regs) && !is_global_init(current) && - rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { - + if (!recovered && evt->sync_error) { /* - * If we received a synchronous error when in userspace - * kill the task. Firmware may report details of the fail - * asynchronously, so we can't rely on the target and type - * fields being valid here. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. + * + * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - printk(KERN_ERR "MCE: uncorrectable error, killing task " - "%s:%d\n", current->comm, current->pid); - - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } - pseries_process_ue(regs, err); - - /* Queue irq work to log this rtas event later. 
*/ - irq_work_queue(&mce_errlog_process_work); - return recovered; } @@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) */ int pSeries_machine_check_exception(struct pt_regs *regs) { - struct rtas_error_log *errp; + struct machine_check_event evt; - if (fwnmi_active) { - fwnmi_release_errinfo(); - errp = fwnmi_get_errlog(); - if (errp && recover_mce(regs, errp)) - return 1; + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; + + /* Print things out */ + if (evt.version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; } + machine_check_print_event_info(&evt, user_mode(regs), false); + + if (recover_mce(regs, &evt)) + return 1; return 0; } @@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs) * to panic. Hence we will call it as soon as we go into * virtual mode. */ - disposition = mce_handle_error(errp); + disposition = mce_handle_error(regs, errp); + fwnmi_release_errinfo(); + + /* Queue irq work to log this rtas event later. */ + irq_work_queue(&mce_errlog_process_work); + if (disposition == RTAS_DISP_FULLY_RECOVERED) return 1; } diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c new file mode 100644 index 000000000000..70c3013fdd07 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "rtas fadump: " fmt + +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/delay.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/fadump.h> +#include <asm/fadump-internal.h> + +#include "rtas-fadump.h" + +static struct rtas_fadump_mem_struct fdm; +static const struct rtas_fadump_mem_struct *fdm_active; + +static void rtas_fadump_update_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rmr_region.destination_address); + + fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + + fadump_conf->boot_memory_size); +} + +/* + * This function is called in the capture kernel to get configuration details + * setup in the first kernel and passed to the f/w. + */ +static void rtas_fadump_get_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_addr[0] = + be64_to_cpu(fdm->rmr_region.source_address); + fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); + fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; + fadump_conf->boot_mem_regs_cnt = 1; + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. 
+ */ + fadump_conf->reserve_dump_area_start = + be64_to_cpu(fdm->cpu_state_data.destination_address); + + rtas_fadump_update_config(fadump_conf, fdm); +} + +static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + + memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm.header.dump_format_version = cpu_to_be32(0x00000001); + fdm.header.dump_num_sections = cpu_to_be16(3); + fdm.header.dump_status_flag = 0; + fdm.header.offset_first_dump_section = + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, + cpu_state_data)); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm.header.dd_block_size = 0; + fdm.header.dd_block_offset = 0; + fdm.header.dd_num_blocks = 0; + fdm.header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm.header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm.cpu_state_data.request_flag = + cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.cpu_state_data.source_data_type = + cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.cpu_state_data.source_address = 0; + fdm.cpu_state_data.source_len = + cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + addr += fadump_conf->cpu_state_data_size; + + /* hpte region section */ + fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.hpte_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.hpte_region.source_address = 0; + fdm.hpte_region.source_len = + cpu_to_be64(fadump_conf->hpte_region_size); + fdm.hpte_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->hpte_region_size; + + /* RMA region section */ + fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rmr_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rmr_region.source_address = cpu_to_be64(0); + fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); + fdm.rmr_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_memory_size; + + rtas_fadump_update_config(fadump_conf, &fdm); + + return addr; +} + +static u64 rtas_fadump_get_bootmem_min(void) +{ + return RTAS_FADUMP_MIN_BOOT_MEM; +} + +static int rtas_fadump_register(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc, err = -EIO; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_REGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case 0: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case -1: + pr_err("Failed to register. Hardware Error(%d).\n", rc); + break; + case -3: + if (!is_fadump_boot_mem_contiguous()) + pr_err("Can't have holes in boot memory area.\n"); + else if (!is_fadump_reserved_mem_contiguous()) + pr_err("Can't have holes in reserved memory area.\n"); + + pr_err("Failed to register. Parameter Error(%d).\n", rc); + err = -EINVAL; + break; + case -9: + pr_err("Already registered!\n"); + fadump_conf->dump_registered = 1; + err = -EEXIST; + break; + default: + pr_err("Failed to register. 
Unknown Error(%d).\n", rc); + break; + } + + return err; +} + +static int rtas_fadump_unregister(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_UNREGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to un-register - unexpected error(%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_registered = 0; + return 0; +} + +static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) +{ + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_INVALIDATE, fdm_active, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to invalidate - unexpected error (%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + fdm_active = NULL; + return 0; +} + +#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000 +static inline int rtas_fadump_gpr_index(u64 id) +{ + char str[3]; + int i = -1; + + if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) { + /* get the digits at the end */ + id &= ~RTAS_FADUMP_GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + if (kstrtoint(str, 10, &i)) + i = -EINVAL; + if (i > 31) + i = -1; + } + return i; +} + +void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +{ + int i; + + i = rtas_fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct rtas_fadump_reg_entry* +rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, + struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { + rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), + be64_to_cpu(reg_entry->reg_value)); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. 
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
+{
+	struct rtas_fadump_reg_save_area_header *reg_header;
+	struct fadump_crash_info_header *fdh = NULL;
+	struct rtas_fadump_reg_entry *reg_entry;
+	u32 num_cpus, *note_buf;
+	int i, rc = 0, cpu = 0;
+	struct pt_regs regs;
+	unsigned long addr;
+	void *vaddr;
+
+	addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (be64_to_cpu(reg_header->magic_number) !=
+	    fadump_str_to_u64("REGSAVE")) {
+		pr_err("Unable to read register save area.\n");
+		return -ENOENT;
+	}
+
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
+	pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
+
+	vaddr += be32_to_cpu(reg_header->num_cpu_offset);
+	num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
+	pr_debug("NumCpus : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
+
+	rc = fadump_setup_cpu_notes_buf(num_cpus);
+	if (rc != 0)
+		return rc;
+
+	note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+
+	if (fadump_conf->fadumphdr_addr)
+		fdh = __va(fadump_conf->fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (be64_to_cpu(reg_entry->reg_id) !=
+		    fadump_str_to_u64("CPUSTRT")) {
+			pr_err("Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contains logical cpu id */
+		cpu = (be64_to_cpu(reg_entry->reg_value) &
+			RTAS_FADUMP_CPU_ID_MASK);
+		if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	final_note(note_buf);
+
+	if (fdh) {
+		pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+			 fdh->elfcorehdr_addr);
+		fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+	}
+	return 0;
+
+error_out:
+	fadump_free_cpu_notes_buf();
+	return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
+{
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!fdm_active || !fadump_conf->fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. 
*/ + if ((be16_to_cpu(fdm_active->header.dump_status_flag) == + RTAS_FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + pr_err("Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + pr_err("Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = rtas_fadump_build_cpu_notes(fadump_conf); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static void rtas_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct rtas_fadump_section *cpu_data_section; + const struct rtas_fadump_mem_struct *fdm_ptr; + + if (fdm_active) + fdm_ptr = fdm_active; + else + fdm_ptr = &fdm; + + cpu_data_section = &(fdm_ptr->cpu_state_data); + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(cpu_data_section->destination_address), + be64_to_cpu(cpu_data_section->destination_address) + + be64_to_cpu(cpu_data_section->source_len) - 1, + be64_to_cpu(cpu_data_section->source_len), + be64_to_cpu(cpu_data_section->bytes_dumped)); + + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->hpte_region.destination_address), + be64_to_cpu(fdm_ptr->hpte_region.destination_address) + + be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, + be64_to_cpu(fdm_ptr->hpte_region.source_len), + be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rmr_region.source_address), + be64_to_cpu(fdm_ptr->rmr_region.destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rmr_region.source_len), + be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + + /* Dump is active. Show reserved area start address. */ + if (fdm_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } +} + +static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)msg); +} + +static struct fadump_ops rtas_fadump_ops = { + .fadump_init_mem_struct = rtas_fadump_init_mem_struct, + .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, + .fadump_register = rtas_fadump_register, + .fadump_unregister = rtas_fadump_unregister, + .fadump_invalidate = rtas_fadump_invalidate, + .fadump_process = rtas_fadump_process, + .fadump_region_show = rtas_fadump_region_show, + .fadump_trigger = rtas_fadump_trigger, +}; + +void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + int i, size, num_sections; + const __be32 *sections; + const __be32 *token; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. 
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return;
+
+	fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
+	fadump_conf->ops = &rtas_fadump_ops;
+	fadump_conf->fadump_supported = 1;
+
+	/* Firmware supports 64-bit value for size, align it to pagesize. */
+	fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE);
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active) {
+		pr_info("Firmware-assisted dump is active.\n");
+		fadump_conf->dump_active = 1;
+		rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
+	}
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 * For each dump section type supported, a 32bit cell which defines
+	 * the ID of a supported section followed by two 32 bit cells which
+	 * gives the size of the section in bytes.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return;
+
+	num_sections = size / (3 * sizeof(u32));
+
+	for (i = 0; i < num_sections; i++, sections += 3) {
+		u32 type = (u32)of_read_number(sections, 1);
+
+		switch (type) {
+		case RTAS_FADUMP_CPU_STATE_DATA:
+			fadump_conf->cpu_state_data_size =
+				of_read_ulong(&sections[1], 2);
+			break;
+		case RTAS_FADUMP_HPTE_REGION:
+			fadump_conf->hpte_region_size =
+				of_read_ulong(&sections[1], 2);
+			break;
+		}
+	}
+}
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
new file mode 100644
index 000000000000..fd59bd7ca9c3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _PSERIES_RTAS_FADUMP_H
+#define _PSERIES_RTAS_FADUMP_H
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully. When kdump infrastructure is
+ * configured to save vmcore over network, we run into OOM issue while
+ * loading modules related to network setup. Hence we need additional 64M
+ * of memory to avoid OOM issue.
+ */
+#define RTAS_FADUMP_MIN_BOOT_MEM	((0x1UL << 28) + (0x1UL << 26))
+
+/* Firmware provided dump sections */
+#define RTAS_FADUMP_CPU_STATE_DATA	0x0001
+#define RTAS_FADUMP_HPTE_REGION		0x0002
+#define RTAS_FADUMP_REAL_MODE_REGION	0x0011
+
+/* Dump request flag */
+#define RTAS_FADUMP_REQUEST_FLAG	0x00000001
+
+/* Dump status flag */
+#define RTAS_FADUMP_ERROR_FLAG		0x2000
+
+/* Kernel Dump section info */
+struct rtas_fadump_section {
+	__be32	request_flag;
+	__be16	source_data_type;
+	__be16	error_flags;
+	__be64	source_address;
+	__be64	source_len;
+	__be64	bytes_dumped;
+	__be64	destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct rtas_fadump_section_header {
+	__be32	dump_format_version;
+	__be16	dump_num_sections;
+	__be16	dump_status_flag;
+	__be32	offset_first_dump_section;
+
+	/* Fields for disk dump option. */
+	__be32	dd_block_size;
+	__be64	dd_block_offset;
+	__be64	dd_num_blocks;
+	__be32	dd_offset_disk_path;
+
+	/* Maximum time allowed to prevent an automatic dump-reboot. */
+	__be32	max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. 
This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct rtas_fadump_mem_struct { + struct rtas_fadump_section_header header; + + /* Kernel dump sections */ + struct rtas_fadump_section cpu_state_data; + struct rtas_fadump_section hpte_region; + + /* + * TODO: Extend multiple boot memory regions support in the kernel + * for this platform. + */ + struct rtas_fadump_section rmr_region; +}; + +/* + * The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" + * and ends with "CPUEND". + */ + +/* Register save area header. */ +struct rtas_fadump_reg_save_area_header { + __be64 magic_number; + __be32 version; + __be32 num_cpu_offset; +}; + +/* Register entry. */ +struct rtas_fadump_reg_entry { + __be64 reg_id; + __be64 reg_value; +}; + +/* Utility macros */ +#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != \ + fadump_str_to_u64("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#endif /* _PSERIES_RTAS_FADUMP_H */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f5940cc71c37..f8adcd0e4589 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -69,6 +69,7 @@ #include <asm/security_features.h> #include <asm/asm-const.h> #include <asm/swiotlb.h> +#include <asm/svm.h> #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -141,17 +142,19 @@ static void __init fwnmi_init(void) } #ifdef CONFIG_PPC_BOOK3S_64 - /* Allocate per cpu slb area to save old slb contents during MCE */ - size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; - slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry), - MEMBLOCK_LOW_LIMIT, ppc64_rma_size, - NUMA_NO_NODE); - if (!slb_ptr) - panic("Failed to allocate %zu bytes below %pa for slb area\n", - size, &ppc64_rma_size); - - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + if (!radix_enabled()) { + /* Allocate per cpu area to save old slb contents during MCE */ + size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; + slb_ptr = memblock_alloc_try_nid_raw(size, + sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!slb_ptr) + panic("Failed to allocate %zu bytes below %pa for slb area\n", + size, &ppc64_rma_size); + + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + } #endif } @@ -297,8 +300,10 @@ static inline int alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); @@ -316,6 +321,9 @@ static void pseries_lpar_idle(void) * low power mode by ceding processor to hypervisor */ + if (!prep_irq_for_idle()) + return; + /* Indicate to hypervisor that we are idle. 
*/ get_lppaca()->idle = 1; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 4b3ef8d9c63f..ad61e90032da 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -41,6 +41,7 @@ #include <asm/dbell.h> #include <asm/plpar_wrappers.h> #include <asm/code-patching.h> +#include <asm/svm.h> #include "pseries.h" #include "offline_states.h" @@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi; diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index 000000000000..40c0637203d5 --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2018 IBM Corporation + * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com> + */ + +#include <linux/mm.h> +#include <asm/machdep.h> +#include <asm/svm.h> +#include <asm/swiotlb.h> +#include <asm/ultravisor.h> + +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* + * Since the guest memory is inaccessible to the host, devices always + * need to use the SWIOTLB buffer for DMA even if dma_capable() says + * otherwise. + */ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +} diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 3473eef7628c..79e2287991db 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1193,7 +1193,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) else tbl->it_ops = &iommu_table_pseries_ops; - return iommu_init_table(tbl, -1); + return iommu_init_table(tbl, -1, 0, 0); } /** |
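The rtas-fadump code above identifies every register in the firmware register save area by an 8-byte ASCII tag ("CPUSTRT", "GPRnn", "NIA", ..., "CPUEND") packed into a 64-bit identifier, and rtas_fadump_gpr_index() recovers the GPR number from the two ASCII digits that follow the "GPR" prefix. The following standalone C sketch illustrates that decoding only; it is not part of the patch, fadump_str_to_u64() is not shown in this diff and is re-implemented here from how it is used (assumed big-endian packing of the tag into the upper bytes of the u64), and the kernel's kstrtoint() is simplified to atoi().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RTAS_FADUMP_GPR_MASK	0xffffff0000000000ULL

/* Assumed helper: pack up to 8 ASCII characters into the high bytes of a u64. */
static uint64_t fadump_str_to_u64(const char *str)
{
	uint64_t val = 0;
	size_t i;

	for (i = 0; i < sizeof(val); i++)
		val = (*str) ? (val << 8) | (unsigned char)*str++ : val << 8;
	return val;
}

/* Mirrors rtas_fadump_gpr_index(): return N for a "GPRnn" id, -1 otherwise. */
static int gpr_index(uint64_t id)
{
	char str[3];
	int i = -1;

	if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
		id &= ~RTAS_FADUMP_GPR_MASK;	/* keep the ASCII digit bytes */
		id >>= 24;			/* drop the trailing NUL padding */
		str[2] = '\0';
		str[1] = id & 0xff;
		str[0] = (id >> 8) & 0xff;
		i = atoi(str);			/* the kernel uses kstrtoint() here */
		if (i > 31)
			i = -1;
	}
	return i;
}

int main(void)
{
	/* "GPR05" as it would appear in the id half of a 16-byte register entry. */
	uint64_t id = fadump_str_to_u64("GPR05");

	printf("reg_id 0x%016llx -> GPR index %d\n",
	       (unsigned long long)id, gpr_index(id));
	return 0;
}

Built with a stock C compiler this prints "reg_id 0x4750523035000000 -> GPR index 5", i.e. the slot that rtas_fadump_set_regval() would fill as regs->gpr[5] while converting the CPU state dump into ELF notes.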