diff options
Diffstat (limited to 'arch/powerpc/platforms')
38 files changed, 1055 insertions, 186 deletions
diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c index 688ffeab0699..55fed5e4de14 100644 --- a/arch/powerpc/platforms/44x/sam440ep.c +++ b/arch/powerpc/platforms/44x/sam440ep.c @@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = { .irq = -1, }; -static int sam440ep_setup_rtc(void) +static int __init sam440ep_setup_rtc(void) { return i2c_register_board_info(0, &sam440ep_rtc_info, 1); } diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig index b625a2c6f4f2..e4c745981912 100644 --- a/arch/powerpc/platforms/52xx/Kconfig +++ b/arch/powerpc/platforms/52xx/Kconfig @@ -33,7 +33,6 @@ config PPC_EFIKA bool "bPlan Efika 5k2. MPC5200B based computer" depends on PPC_MPC52xx select PPC_RTAS - select RTAS_PROC select PPC_NATIVE config PPC_LITE5200 diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 078097a0b09d..f51fd35f4618 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -344,6 +344,7 @@ done: } struct smp_ops_t smp_85xx_ops = { + .cause_nmi_ipi = NULL, .kick_cpu = smp_85xx_kick_cpu, .cpu_bootable = smp_generic_cpu_bootable, #ifdef CONFIG_HOTPLUG_CPU @@ -461,16 +462,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image) } #endif /* CONFIG_KEXEC_CORE */ -static void smp_85xx_basic_setup(int cpu_nr) -{ - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -} - static void smp_85xx_setup_cpu(int cpu_nr) { mpic_setup_this_cpu(); - smp_85xx_basic_setup(cpu_nr); } void __init mpc85xx_smp_init(void) @@ -484,7 +478,7 @@ void __init mpc85xx_smp_init(void) smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu; smp_85xx_ops.message_pass = smp_mpic_message_pass; } else - smp_85xx_ops.setup_cpu = smp_85xx_basic_setup; + smp_85xx_ops.setup_cpu = NULL; if (cpu_has_feature(CPU_FTR_DBELL)) { /* @@ -492,7 +486,7 @@ void __init mpc85xx_smp_init(void) * smp_muxed_ipi_message_pass */ smp_85xx_ops.message_pass = NULL; - smp_85xx_ops.cause_ipi = doorbell_cause_ipi; + smp_85xx_ops.cause_ipi = doorbell_global_ipi; smp_85xx_ops.probe = NULL; } diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c index af09baee22cb..020e84a47a32 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c @@ -105,6 +105,7 @@ smp_86xx_setup_cpu(int cpu_nr) struct smp_ops_t smp_86xx_ops = { + .cause_nmi_ipi = NULL, .message_pass = smp_mpic_message_pass, .probe = smp_mpic_probe, .kick_cpu = smp_86xx_kick_cpu, diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 99b0ae8acb78..684e886eaae4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -279,7 +279,8 @@ config PPC_ICSWX This option enables kernel support for the PowerPC Initiate Coprocessor Store Word (icswx) coprocessor instruction on POWER7 - or newer processors. + and POWER8 processors. POWER9 uses new copy/paste instructions + to invoke the coprocessor. This option is only useful if you have a processor that supports the icswx coprocessor instruction. It does not have any effect @@ -359,7 +360,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) + default y if PPC_STD_MMU_64 default n config PPC_HAVE_PMU_SUPPORT @@ -371,9 +372,16 @@ config PPC_PERF_CTRS help This enables the powerpc-specific perf_event back-end. +config FORCE_SMP + # Allow platforms to force SMP=y by selecting this + bool + default n + select SMP + config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x - bool "Symmetric multi-processing support" + select GENERIC_IRQ_MIGRATION + bool "Symmetric multi-processing support" if !FORCE_SMP ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 8b55c5f19d4c..8d3ae2cc52bf 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -15,9 +15,9 @@ #include <linux/msi.h> #include <linux/export.h> #include <linux/of_platform.h> -#include <linux/debugfs.h> #include <linux/slab.h> +#include <asm/debugfs.h> #include <asm/dcr.h> #include <asm/machdep.h> #include <asm/prom.h> diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index a6bbbaba14a3..871d38479a25 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -211,7 +211,7 @@ void iic_request_IPIs(void) iic_request_ipi(PPC_MSG_CALL_FUNCTION); iic_request_ipi(PPC_MSG_RESCHEDULE); iic_request_ipi(PPC_MSG_TICK_BROADCAST); - iic_request_ipi(PPC_MSG_DEBUGGER_BREAK); + iic_request_ipi(PPC_MSG_NMI_IPI); } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c index e7d075077cb0..a88944db9fc3 100644 --- a/arch/powerpc/platforms/cell/pervasive.c +++ b/arch/powerpc/platforms/cell/pervasive.c @@ -88,11 +88,14 @@ static void cbe_power_save(void) static int cbe_system_reset_exception(struct pt_regs *regs) { switch (regs->msr & SRR1_WAKEMASK) { - case SRR1_WAKEEE: - do_IRQ(regs); - break; case SRR1_WAKEDEC: - timer_interrupt(regs); + set_dec(1); + case SRR1_WAKEEE: + /* + * Handle these when interrupts get re-enabled and we take + * them as regular exceptions. We are in an NMI context + * and can't handle these here. + */ break; case SRR1_WAKEMT: return cbe_sysreset_hack(); diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c index b6c9a0dcc924..14515040f7cd 100644 --- a/arch/powerpc/platforms/chrp/smp.c +++ b/arch/powerpc/platforms/chrp/smp.c @@ -44,6 +44,7 @@ static void smp_chrp_setup_cpu(int cpu_nr) /* CHRP with openpic */ struct smp_ops_t chrp_smp_ops = { + .cause_nmi_ipi = NULL, .message_pass = smp_mpic_message_pass, .probe = smp_mpic_probe, .kick_cpu = smp_chrp_kick_cpu, diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 75b296bc51af..44e0d9226f0a 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -53,11 +53,14 @@ static int pasemi_system_reset_exception(struct pt_regs *regs) regs->nip = regs->link; switch (regs->msr & SRR1_WAKEMASK) { - case SRR1_WAKEEE: - do_IRQ(regs); - break; case SRR1_WAKEDEC: - timer_interrupt(regs); + set_dec(1); + case SRR1_WAKEEE: + /* + * Handle these when interrupts get re-enabled and we take + * them as regular exceptions. We are in an NMI context + * and can't handle these here. + */ break; default: /* do system reset */ diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 746ca7321b03..2cd99eb30762 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d) return IRQ_HANDLED; } -static void smp_psurge_cause_ipi(int cpu, unsigned long data) +static void smp_psurge_cause_ipi(int cpu) { psurge_set_ipi(cpu); } @@ -447,6 +447,7 @@ void __init smp_psurge_give_timebase(void) struct smp_ops_t psurge_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = smp_psurge_cause_ipi, + .cause_nmi_ipi = NULL, .probe = smp_psurge_probe, .kick_cpu = smp_psurge_kick_cpu, .setup_cpu = smp_psurge_setup_cpu, diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 3a07e4dcf97c..6a6f4ef46b9e 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -4,6 +4,7 @@ config PPC_POWERNV select PPC_NATIVE select PPC_XICS select PPC_ICP_NATIVE + select PPC_XIVE_NATIVE select PPC_P7_NAP select PCI select PCI_MSI @@ -19,6 +20,8 @@ config PPC_POWERNV select CPU_FREQ_GOV_ONDEMAND select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP default y config OPAL_PRD diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6fb5522acd70..d2f19821d71d 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } + /* + * If dealing with the root bus (or the bus underneath the + * root port), we reset the bus underneath the root port. + * + * The cxl driver depends on this behaviour for bi-modal card + * switching. + */ if (pci_is_root_bus(bus) || pci_is_root_bus(bus->parent)) return pnv_eeh_root_reset(hose, option); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 4ee837e6391a..445f30a2c5ef 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t pir = get_hard_smp_processor_id(cpu); uint64_t hsprg0_val = (uint64_t)&paca[cpu]; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) { - /* - * HSPRG0 is used to store the cpu's pointer to paca. - * Hence last 3 bits are guaranteed to be 0. Program - * slw to restore HSPRG0 with 63rd bit set, so that - * when a thread wakes up at 0x100 we can use this bit - * to distinguish between fastsleep and deep winkle. - * This is not necessary with stop/psscr since PLS - * field of psscr indicates which state we are waking - * up from. - */ - hsprg0_val |= 1; - } rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); if (rc != 0) return rc; @@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void) for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; int node = cpu_to_node(first_cpu); + size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + paca_ptr_array_size = (threads_per_core * + sizeof(struct paca_struct *)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; @@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void) paca[cpu].core_idle_state_ptr = core_idle_state; paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; paca[cpu].thread_mask = 1 << j; + if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) + continue; + paca[cpu].thread_sibling_pacas = + kmalloc_node(paca_ptr_array_size, + GFP_KERNEL, node); } } @@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void) } EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); - static void pnv_fastsleep_workaround_apply(void *info) { @@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -u64 pnv_default_stop_val; -u64 pnv_default_stop_mask; +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; /* * Used for ppc_md.power_save which needs a function with no parameters @@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -u64 pnv_deepest_stop_psscr_val; -u64 pnv_deepest_stop_psscr_mask; +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + +/* + * pnv_cpu_offline: A function that puts the CPU into the deepest + * available platform idle state on a CPU-Offline. + */ +unsigned long pnv_cpu_offline(unsigned int cpu) +{ + unsigned long srr1; + + u32 idle_states = pnv_get_supported_cpuidle_states(); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { + srr1 = power7_winkle(); + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + srr1 = power7_sleep(); + } else if (idle_states & OPAL_PM_NAP_ENABLED) { + srr1 = power7_nap(1); + } else { + /* This is the fallback method. We emulate snooze */ + while (!generic_check_cpu_restart(cpu)) { + HMT_low(); + HMT_very_low(); + } + srr1 = 0; + HMT_medium(); + } + + return srr1; +} /* * Power ISA 3.0 idle initialization. @@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, u32 *residency_ns = NULL; u64 max_residency_ns = 0; int rc = 0, i; - bool default_stop_found = false, deepest_stop_found = false; psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); @@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, } } - if (!default_stop_found) { - pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; - pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!default_stop_found)) { + pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); + } else { + ppc_md.power_save = power9_idle; + pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } - if (!deepest_stop_found) { - pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; - pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!deepest_stop_found)) { + pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait"); + } else { + pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } + pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", + pnv_first_deep_stop_state); out: kfree(psscr_val); kfree(psscr_mask); @@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void) pnv_alloc_idle_core_states(); + /* + * For each CPU, record its PACA address in each of it's + * sibling thread's PACA at the slot corresponding to this + * CPU's index in the core. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + int cpu; + + pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); + for_each_possible_cpu(cpu) { + int base_cpu = cpu_first_thread_sibling(cpu); + int idx = cpu_thread_in_core(cpu); + int i; + + for (i = 0; i < threads_per_core; i++) { + int j = base_cpu + i; + + paca[j].thread_sibling_pacas[idx] = &paca[cpu]; + } + } + } + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; - else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) - ppc_md.power_save = power9_idle; out: return 0; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 1c383f38031d..067defeea691 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,11 +9,20 @@ * License as published by the Free Software Foundation. */ +#include <linux/slab.h> +#include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> +#include <linux/of.h> #include <linux/export.h> #include <linux/pci.h> #include <linux/memblock.h> #include <linux/iommu.h> +#include <asm/tlb.h> +#include <asm/powernv.h> +#include <asm/reg.h> +#include <asm/opal.h> +#include <asm/io.h> #include <asm/iommu.h> #include <asm/pnv-pci.h> #include <asm/msi_bitmap.h> @@ -22,6 +31,8 @@ #include "powernv.h" #include "pci.h" +#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) + /* * Other types of TCE cache invalidation are not functional in the * hardware. @@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) struct device_node *dn; struct pci_dev *gpdev; + if (WARN_ON(!npdev)) + return NULL; + + if (WARN_ON(!npdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); if (!dn) @@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) struct device_node *dn; struct pci_dev *npdev; + if (WARN_ON(!gpdev)) + return NULL; + + if (WARN_ON(!gpdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); if (!dn) @@ -180,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); return rc; } - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); /* Add the table to the list so its TCE cache will get invalidated */ pnv_pci_link_table_and_group(phb->hose->node, num, @@ -204,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) pe_err(npe, "Unmapping failed, ret = %lld\n", rc); return rc; } - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); pnv_pci_unlink_table_and_group(npe->table_group.tables[num], &npe->table_group); @@ -270,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) 0 /* bypass base */, top); if (rc == OPAL_SUCCESS) - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); return rc; } @@ -334,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) pe_err(npe, "Failed to disable bypass, err %lld\n", rc); return; } - pnv_pci_phb3_tce_invalidate_entire(npe->phb, false); + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); } struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) @@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) return gpe; } + +/* Maximum number of nvlinks per npu */ +#define NV_MAX_LINKS 6 + +/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ +static int max_npu2_index; + +struct npu_context { + struct mm_struct *mm; + struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; + struct mmu_notifier mn; + struct kref kref; + + /* Callback to stop translation requests on a given GPU */ + struct npu_context *(*release_cb)(struct npu_context *, void *); + + /* + * Private pointer passed to the above callback for usage by + * device drivers. + */ + void *priv; +}; + +/* + * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC + * if none are available. + */ +static int get_mmio_atsd_reg(struct npu *npu) +{ + int i; + + for (i = 0; i < npu->mmio_atsd_count; i++) { + if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) + return i; + } + + return -ENOSPC; +} + +static void put_mmio_atsd_reg(struct npu *npu, int reg) +{ + clear_bit(reg, &npu->mmio_atsd_usage); +} + +/* MMIO ATSD register offsets */ +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 + +static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, + unsigned long va) +{ + int mmio_atsd_reg; + + do { + mmio_atsd_reg = get_mmio_atsd_reg(npu); + cpu_relax(); + } while (mmio_atsd_reg < 0); + + __raw_writeq(cpu_to_be64(va), + npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); + eieio(); + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); + + return mmio_atsd_reg; +} + +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate matching PID */ + launch = PPC_BIT(12); + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* Invalidating the entire process doesn't use a va */ + return mmio_launch_invalidate(npu, launch, 0); +} + +static int mmio_invalidate_va(struct npu *npu, unsigned long va, + unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate target VA */ + launch = 0; + + /* PRS set to process scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + return mmio_launch_invalidate(npu, launch, va); +} + +#define mn_to_npu_context(x) container_of(x, struct npu_context, mn) + +/* + * Invalidate either a single address or an entire PID depending on + * the value of va. + */ +static void mmio_invalidate(struct npu_context *npu_context, int va, + unsigned long address) +{ + int i, j, reg; + struct npu *npu; + struct pnv_phb *nphb; + struct pci_dev *npdev; + struct { + struct npu *npu; + int reg; + } mmio_atsd_reg[NV_MAX_NPUS]; + unsigned long pid = npu_context->mm->context.id; + + /* + * Loop over all the NPUs this process is active on and launch + * an invalidate. + */ + for (i = 0; i <= max_npu2_index; i++) { + mmio_atsd_reg[i].reg = -1; + for (j = 0; j < NV_MAX_LINKS; j++) { + npdev = npu_context->npdev[i][j]; + if (!npdev) + continue; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + mmio_atsd_reg[i].npu = npu; + + if (va) + mmio_atsd_reg[i].reg = + mmio_invalidate_va(npu, address, pid); + else + mmio_atsd_reg[i].reg = + mmio_invalidate_pid(npu, pid); + + /* + * The NPU hardware forwards the shootdown to all GPUs + * so we only have to launch one shootdown per NPU. + */ + break; + } + } + + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + put_mmio_atsd_reg(npu, reg); + } +} + +static void pnv_npu2_mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + /* Call into device driver to stop requests to the NMMU */ + if (npu_context->release_cb) + npu_context->release_cb(npu_context, npu_context->priv); + + /* + * There should be no more translation requests for this PID, but we + * need to ensure any entries for it are removed from the TLB. + */ + mmio_invalidate(npu_context, 0, 0); +} + +static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + unsigned long address; + + for (address = start; address <= end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address); +} + +static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { + .release = pnv_npu2_mn_release, + .change_pte = pnv_npu2_mn_change_pte, + .invalidate_page = pnv_npu2_mn_invalidate_page, + .invalidate_range = pnv_npu2_mn_invalidate_range, +}; + +/* + * Call into OPAL to setup the nmmu context for the current task in + * the NPU. This must be called to setup the context tables before the + * GPU issues ATRs. pdev should be a pointed to PCIe GPU device. + * + * A release callback should be registered to allow a device driver to + * be notified that it should not launch any new translation requests + * as the final TLB invalidate is about to occur. + * + * Returns an error if there no contexts are currently available or a + * npu_context which should be passed to pnv_npu2_handle_fault(). + * + * mmap_sem must be held in write mode. + */ +struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) +{ + int rc; + u32 nvlink_index; + struct device_node *nvlink_dn; + struct mm_struct *mm = current->mm; + struct pnv_phb *nphb; + struct npu *npu; + struct npu_context *npu_context; + + /* + * At present we don't support GPUs connected to multiple NPUs and I'm + * not sure the hardware does either. + */ + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return ERR_PTR(-ENODEV); + + if (!npdev) + /* No nvlink associated with this GPU device */ + return ERR_PTR(-ENODEV); + + if (!mm) { + /* kernel thread contexts are not supported */ + return ERR_PTR(-EINVAL); + } + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + + /* + * Setup the NPU context table for a particular GPU. These need to be + * per-GPU as we need the tables to filter ATSDs when there are no + * active contexts on a particular GPU. + */ + rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (rc < 0) + return ERR_PTR(-ENOSPC); + + /* + * We store the npu pci device so we can more easily get at the + * associated npus. + */ + npu_context = mm->context.npu_context; + if (!npu_context) { + npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); + if (!npu_context) + return ERR_PTR(-ENOMEM); + + mm->context.npu_context = npu_context; + npu_context->mm = mm; + npu_context->mn.ops = &nv_nmmu_notifier_ops; + __mmu_notifier_register(&npu_context->mn, mm); + kref_init(&npu_context->kref); + } else { + kref_get(&npu_context->kref); + } + + npu_context->release_cb = cb; + npu_context->priv = priv; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return ERR_PTR(-ENODEV); + npu_context->npdev[npu->index][nvlink_index] = npdev; + + return npu_context; +} +EXPORT_SYMBOL(pnv_npu2_init_context); + +static void pnv_npu2_release_context(struct kref *kref) +{ + struct npu_context *npu_context = + container_of(kref, struct npu_context, kref); + + npu_context->mm->context.npu_context = NULL; + mmu_notifier_unregister(&npu_context->mn, + npu_context->mm); + + kfree(npu_context); +} + +void pnv_npu2_destroy_context(struct npu_context *npu_context, + struct pci_dev *gpdev) +{ + struct pnv_phb *nphb, *phb; + struct npu *npu; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct device_node *nvlink_dn; + u32 nvlink_index; + + if (WARN_ON(!npdev)) + return; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + phb = pci_bus_to_host(gpdev->bus)->private_data; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return; + npu_context->npdev[npu->index][nvlink_index] = NULL; + opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + kref_put(&npu_context->kref, pnv_npu2_release_context); +} +EXPORT_SYMBOL(pnv_npu2_destroy_context); + +/* + * Assumes mmap_sem is held for the contexts associated mm. + */ +int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, int count) +{ + u64 rc = 0, result = 0; + int i, is_write; + struct page *page[1]; + + /* mmap_sem should be held so the struct_mm must be present */ + struct mm_struct *mm = context->mm; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); + + for (i = 0; i < count; i++) { + is_write = flags[i] & NPU2_WRITE; + rc = get_user_pages_remote(NULL, mm, ea[i], 1, + is_write ? FOLL_WRITE : 0, + page, NULL, NULL); + + /* + * To support virtualised environments we will have to do an + * access to the page to ensure it gets faulted into the + * hypervisor. For the moment virtualisation is not supported in + * other areas so leave the access out. + */ + if (rc != 1) { + status[i] = rc; + result = -EFAULT; + continue; + } + + status[i] = 0; + put_page(page[0]); + } + + return result; +} +EXPORT_SYMBOL(pnv_npu2_handle_fault); + +int pnv_npu2_init(struct pnv_phb *phb) +{ + unsigned int i; + u64 mmio_atsd; + struct device_node *dn; + struct pci_dev *gpdev; + static int npu_index; + uint64_t rc = 0; + + for_each_child_of_node(phb->hose->dn, dn) { + gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); + if (gpdev) { + rc = opal_npu_map_lpar(phb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), + 0, 0); + if (rc) + dev_err(&gpdev->dev, + "Error %lld mapping device to LPAR\n", + rc); + } + } + + for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", + i, &mmio_atsd); i++) + phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + + pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); + phb->npu.mmio_atsd_count = i; + phb->npu.mmio_atsd_usage = 0; + npu_index++; + if (WARN_ON(npu_index >= NV_MAX_NPUS)) + return -ENOSPC; + max_npu2_index = npu_index; + phb->npu.index = npu_index; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index a91d7876fae2..6c7ad1d8b32e 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/of.h> #include <linux/bug.h> -#include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> @@ -21,7 +20,7 @@ #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index 308efd170c27..aa267f120033 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) *sensor_data = be32_to_cpu(data); break; + case OPAL_WRONG_STATE: + ret = -EIO; + break; + default: ret = opal_error_code(ret); break; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index da8a0f7a035c..f620572f891f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \ #define OPAL_BRANCH(LABEL) #endif -/* TODO: - * - * - Trace irqs in/off (needs saving/restoring all args, argh...) - * - Get r11 feed up by Dave so I can have better register usage +/* + * DO_OPAL_CALL assumes: + * r0 = opal call token + * r12 = msr + * LR has been saved */ - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ +#define DO_OPAL_CALL() \ mfcr r11; \ stw r11,8(r1); \ li r11,0; \ @@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \ mtspr SPRN_HSRR0,r12; \ hrfid +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name); \ + mfmsr r12; \ + mflr r0; \ + andi. r11,r12,MSR_IR|MSR_DR; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + beq opal_real_call; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + DO_OPAL_CALL() + + opal_return: /* * Fixup endian on OPAL return... we should be able to simplify @@ -148,26 +152,13 @@ opal_tracepoint_entry: ld r8,STK_REG(R29)(r1) ld r9,STK_REG(R30)(r1) ld r10,STK_REG(R31)(r1) + + /* setup LR so we return via tracepoint_return */ LOAD_REG_ADDR(r11,opal_tracepoint_return) - mfcr r12 std r11,16(r1) - stw r12,8(r1) - li r11,0 + mfmsr r12 - ori r11,r11,MSR_EE - std r12,PACASAVEDMSR(r13) - andc r12,r12,r11 - mtmsrd r12,1 - LOAD_REG_ADDR(r11,opal_return) - mtlr r11 - li r11,MSR_DR|MSR_IR|MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid + DO_OPAL_CALL() opal_tracepoint_return: std r3,STK_REG(R31)(r1) @@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index d0ac535cf5d7..28651fb25417 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc) static u64 opal_scom_unmangle(u64 addr) { + u64 tmp; + /* - * XSCOM indirect addresses have the top bit set. Additionally - * the rest of the top 3 nibbles is always 0. + * XSCOM addresses use the top nibble to set indirect mode and + * its form. Bits 4-11 are always 0. * * Because the debugfs interface uses signed offsets and shifts * the address left by 3, we basically cannot use the top 4 bits * of the 64-bit address, and thus cannot use the indirect bit. * - * To deal with that, we support the indirect bit being in bit - * 4 (IBM notation) instead of bit 0 in this API, we do the - * conversion here. To leave room for further xscom address - * expansion, we only clear out the top byte + * To deal with that, we support the indirect bits being in + * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we + * do the conversion here. * - * For in-kernel use, we also support the real indirect bit, so - * we test for any of the top 5 bits + * For in-kernel use, we don't need to do this mangling. In + * kernel won't have bits 4-7 set. * + * So: + * debugfs will always set 0-3 = 0 and clear 4-7 + * kernel will always clear 0-3 = 0 and set 4-7 */ - if (addr & (0x1full << 59)) - addr = (addr & ~(0xffull << 56)) | (1ull << 63); + tmp = addr; + tmp &= 0x0f00000000000000; + addr &= 0xf0ffffffffffffff; + addr |= tmp << 4; + return addr; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e0f856bfbfe8..7925a9d72cca 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt); + machine_check_print_event_info(&evt, user_mode(regs)); if (opal_recover_mce(regs, &evt)) return 1; @@ -595,6 +595,80 @@ static void opal_export_symmap(void) pr_warn("Error %d creating OPAL symbols file\n", rc); } +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, bin_attr->private, + bin_attr->size); +} + +/* + * opal_export_attrs: creates a sysfs node for each property listed in + * the device-tree under /ibm,opal/firmware/exports/ + * All new sysfs nodes are created under /opal/exports/. + * This allows for reserved memory regions (e.g. HDAT) to be read. + * The new sysfs nodes are only readable by root. + */ +static void opal_export_attrs(void) +{ + struct bin_attribute *attr; + struct device_node *np; + struct property *prop; + struct kobject *kobj; + u64 vals[2]; + int rc; + + np = of_find_node_by_path("/ibm,opal/firmware/exports"); + if (!np) + return; + + /* Create new 'exports' directory - /sys/firmware/opal/exports */ + kobj = kobject_create_and_add("exports", opal_kobj); + if (!kobj) { + pr_warn("kobject_create_and_add() of exports failed\n"); + return; + } + + for_each_property_of_node(np, prop) { + if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) + continue; + + if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) + continue; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + + if (attr == NULL) { + pr_warn("Failed kmalloc for bin_attribute!"); + continue; + } + + sysfs_bin_attr_init(attr); + attr->attr.name = kstrdup(prop->name, GFP_KERNEL); + attr->attr.mode = 0400; + attr->read = export_attr_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; + + if (attr->attr.name == NULL) { + pr_warn("Failed kstrdup for bin_attribute attr.name"); + kfree(attr); + continue; + } + + rc = sysfs_create_bin_file(kobj, attr); + if (rc) { + pr_warn("Error %d creating OPAL sysfs exports/%s file\n", + rc, prop->name); + kfree(attr->attr.name); + kfree(attr); + } + } + + of_node_put(np); +} + static void __init opal_dump_region_init(void) { void *addr; @@ -733,6 +807,9 @@ static int __init opal_init(void) opal_msglog_sysfs_init(); } + /* Export all properties */ + opal_export_attrs(); + /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init("ibm,opal-ipmi"); opal_pdev_init("ibm,opal-flash"); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e36738291c32..6fdbd383f676 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -14,7 +14,6 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/crash_dump.h> -#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/init.h> @@ -38,7 +37,7 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> @@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void) /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); + if (phb->model == PNV_PHB_MODEL_NPU2) + pnv_npu2_init(phb); } } } @@ -1424,8 +1425,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); } - pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + iommu_tce_table_put(tbl); } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) @@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, return ret; } + +static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, @@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, + .exchange_rm = pnv_ioda1_tce_xchg_rm, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -1883,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) { __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; @@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, { struct iommu_table_group_link *tgl; - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) { struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); struct pnv_phb *phb = pe->phb; @@ -1979,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, } } +void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +{ + if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) + pnv_pci_phb3_tce_invalidate_entire(phb, rm); + else + opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); +} + static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, @@ -2004,6 +2024,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, return ret; } + +static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, @@ -2017,13 +2048,13 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static void pnv_ioda2_table_free(struct iommu_table *tbl) { pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, + .exchange_rm = pnv_ioda2_tce_xchg_rm, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, @@ -2128,6 +2159,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, found: tbl = pnv_pci_table_alloc(phb->hose->node); + if (WARN_ON(!tbl)) + return; + iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); @@ -2203,7 +2237,7 @@ found: __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } } @@ -2293,16 +2327,16 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, if (!tbl) return -ENOMEM; + tbl->it_ops = &pnv_ioda2_iommu_ops; + ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, levels, tbl); if (ret) { - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); return ret; } - tbl->it_ops = &pnv_ioda2_iommu_ops; - *ptbl = tbl; return 0; @@ -2343,7 +2377,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); - pnv_ioda2_table_free(tbl); + iommu_tce_table_put(tbl); return rc; } @@ -2414,7 +2448,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, tce_table_size /= direct_table_size; tce_table_size <<= 3; - tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size); + tce_table_size = max_t(unsigned long, + tce_table_size, direct_table_size); } return bytes; @@ -2431,7 +2466,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus, false); - pnv_ioda2_table_free(tbl); + iommu_tce_table_put(tbl); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) @@ -2735,9 +2770,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (rc) return; - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(&pe->pdev->dev); - else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus, true); } @@ -3406,7 +3439,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) } free_pages(tbl->it_base, get_order(tbl->it_size << 3)); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) @@ -3433,7 +3466,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) } pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index eb835e977e33..935ccb249a8a 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return *(pnv_tce(tbl, index - tbl->it_offset)); + return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); } struct iommu_table *pnv_pci_table_alloc(int nid) @@ -766,7 +766,11 @@ struct iommu_table *pnv_pci_table_alloc(int nid) struct iommu_table *tbl; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); + if (!tbl) + return NULL; + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); return tbl; } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index e1d3e5526b54..18c8a2fa03b8 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -7,6 +7,9 @@ struct pci_dn; +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 + enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -174,6 +177,16 @@ struct pnv_phb { struct OpalIoP7IOCErrorData hub_diag; } diag; + /* Nvlink2 data */ + struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + } npu; + #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif @@ -229,14 +242,14 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, /* Nvlink functions */ extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); -extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, struct iommu_table *tbl); extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); - +extern int pnv_npu2_init(struct pnv_phb *phb); /* cxl functions */ extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 613052232475..6dbc0a1da1f6 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { } #endif extern u32 pnv_get_supported_cpuidle_states(void); -extern u64 pnv_deepest_stop_psscr_val; -extern u64 pnv_deepest_stop_psscr_mask; extern void pnv_lpc_init(void); diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 5dcbdea1afac..1a9d84371a4d 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v) rng = raw_cpu_read(powernv_rng); - *v = rng_whiten(rng, in_rm64(rng->regs_real)); + *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); return 1; } diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d50c7d99baaf..2dc7e5fb86c3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -32,6 +32,7 @@ #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/opal.h> #include <asm/kexec.h> #include <asm/smp.h> @@ -76,7 +77,9 @@ static void __init pnv_init(void) static void __init pnv_init_IRQ(void) { - xics_init(); + /* Try using a XIVE if available, otherwise use a XICS */ + if (!xive_native_init()) + xics_init(); WARN_ON(!ppc_md.get_irq); } @@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m) else seq_printf(m, "firmware\t: BML\n"); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } static void pnv_prepare_going_down(void) @@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void) static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { - xics_kexec_teardown_cpu(secondary); + if (xive_enabled()) + xive_kexec_teardown_cpu(secondary); + else + xics_kexec_teardown_cpu(secondary); /* On OPAL, we return all CPUs to firmware */ - if (!firmware_has_feature(FW_FEATURE_OPAL)) return; @@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) /* Primary waits for the secondaries to have reached OPAL */ pnv_kexec_wait_secondaries_down(); + /* Switch XIVE back to emulation mode */ + if (xive_enabled()) + xive_shutdown(); + /* * We might be running as little-endian - now that interrupts * are disabled, reset the HILE bit to big-endian so we don't diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 8b67e1eefb5c..4aff754b6f2c 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -29,12 +29,14 @@ #include <asm/vdso_datapage.h> #include <asm/cputhreads.h> #include <asm/xics.h> +#include <asm/xive.h> #include <asm/opal.h> #include <asm/runlatch.h> #include <asm/code-patching.h> #include <asm/dbell.h> #include <asm/kvm_ppc.h> #include <asm/ppc-opcode.h> +#include <asm/cpuidle.h> #include "powernv.h" @@ -47,13 +49,10 @@ static void pnv_smp_setup_cpu(int cpu) { - if (cpu != boot_cpuid) + if (xive_enabled()) + xive_smp_setup_cpu(); + else if (cpu != boot_cpuid) xics_setup_cpu(); - -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -#endif } static int pnv_smp_kick_cpu(int nr) @@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void) vdso_data->processorCount--; if (cpu == boot_cpuid) boot_cpuid = cpumask_any(cpu_online_mask); - xics_migrate_irqs_away(); + if (xive_enabled()) + xive_smp_disable_cpu(); + else + xics_migrate_irqs_away(); return 0; } @@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; - u32 idle_states; /* Standard hot unplug procedure */ local_irq_disable(); @@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - idle_states = pnv_get_supported_cpuidle_states(); - /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) * enabled as to let IPIs in. @@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void) kvmppc_set_host_ipi(cpu, 0); ppc64_runlatch_off(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); - } else { - srr1 = power7_nap(1); - } - + srr1 = pnv_cpu_offline(cpu); ppc64_runlatch_on(); /* @@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void) if (((srr1 & wmask) == SRR1_WAKEEE) || ((srr1 & wmask) == SRR1_WAKEHVI) || (local_paca->irq_happened & PACA_IRQ_EE)) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) - icp_opal_flush_interrupt(); - else + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (xive_enabled()) + xive_flush_interrupt(); + else + icp_opal_flush_interrupt(); + } else icp_native_flush_interrupt(); } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); @@ -252,10 +242,69 @@ static int pnv_cpu_bootable(unsigned int nr) return smp_generic_cpu_bootable(nr); } +static int pnv_smp_prepare_cpu(int cpu) +{ + if (xive_enabled()) + return xive_smp_prepare_cpu(cpu); + return 0; +} + +/* Cause IPI as setup by the interrupt controller (xics or xive) */ +static void (*ic_cause_ipi)(int cpu); + +static void pnv_cause_ipi(int cpu) +{ + if (doorbell_try_core_ipi(cpu)) + return; + + ic_cause_ipi(cpu); +} + +static void pnv_p9_dd1_cause_ipi(int cpu) +{ + int this_cpu = get_cpu(); + + /* + * POWER9 DD1 has a global addressed msgsnd, but for now we restrict + * IPIs to same core, because it requires additional synchronization + * for inter-core doorbells which we do not implement. + */ + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) + doorbell_global_ipi(cpu); + else + ic_cause_ipi(cpu); + + put_cpu(); +} + +static void __init pnv_smp_probe(void) +{ + if (xive_enabled()) + xive_smp_probe(); + else + xics_smp_probe(); + + if (cpu_has_feature(CPU_FTR_DBELL)) { + ic_cause_ipi = smp_ops->cause_ipi; + WARN_ON(!ic_cause_ipi); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; + else + smp_ops->cause_ipi = doorbell_global_ipi; + } else { + smp_ops->cause_ipi = pnv_cause_ipi; + } + } +} + static struct smp_ops_t pnv_smp_ops = { - .message_pass = smp_muxed_ipi_message_pass, - .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ - .probe = xics_smp_probe, + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ + .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ + .cause_nmi_ipi = NULL, + .probe = pnv_smp_probe, + .prepare_cpu = pnv_smp_prepare_cpu, .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, .cpu_bootable = pnv_cpu_bootable, diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index 60154d08debf..1d1ad5df106f 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -77,7 +77,7 @@ static void __init ps3_smp_probe(void) BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0); BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1); BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2); - BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3); + BUILD_BUG_ON(PPC_MSG_NMI_IPI != 3); for (i = 0; i < MSG_COUNT; i++) { result = ps3_event_receive_port_setup(cpu, &virqs[i]); @@ -96,7 +96,7 @@ static void __init ps3_smp_probe(void) ps3_register_ipi_irq(cpu, virqs[i]); } - ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]); + ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]); DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu); } diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 30ec04f1c67c..913c54e23eea 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -17,9 +17,10 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_NATIVE select PPC_DOORBELL - select HOTPLUG_CPU if SMP + select HOTPLUG_CPU select ARCH_RANDOM select PPC_DOORBELL + select FORCE_SMP default y config PPC_SPLPAR diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 193e052fa0dd..bda18d8e1674 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn) if (rc) return rc; - of_node_put(dn); /* Must decrement the refcount */ return 0; } diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 6b04e3f0f982..18014cdeb590 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -21,13 +21,12 @@ */ #include <linux/slab.h> -#include <linux/debugfs.h> #include <linux/spinlock.h> #include <asm/smp.h> #include <linux/uaccess.h> #include <asm/firmware.h> #include <asm/lppaca.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/plpar_wrappers.h> #include <asm/machdep.h> diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index f02ec3ab428c..957ae347b0b3 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -29,6 +29,16 @@ #include <asm/trace.h> #include <asm/machdep.h> +/* For hcall instrumentation. One structure per-hcall, per-CPU */ +struct hcall_stats { + unsigned long num_calls; /* number of calls (on this CPU) */ + unsigned long tb_total; /* total wall time (mftb) of calls. */ + unsigned long purr_total; /* total cpu time (PURR) of calls. */ + unsigned long tb_start; + unsigned long purr_start; +}; +#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) + DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); /* diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4d757eaa46bf..8374adee27e3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) goto fail_exit; INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); tgl->table_group = table_group; list_add_rcu(&tgl->next, &tbl->it_group_list); @@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, BUG_ON(table_group->group); } #endif - iommu_free_table(tbl, node_name); + iommu_tce_table_put(tbl); kfree(table_group); } @@ -550,6 +551,7 @@ static void iommu_table_setparms(struct pci_controller *phb, static void iommu_table_setparms_lpar(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl, + struct iommu_table_group *table_group, const __be32 *dma_window) { unsigned long offset, size; @@ -563,6 +565,9 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, tbl->it_type = TCE_PCI; tbl->it_offset = offset >> tbl->it_page_shift; tbl->it_size = size >> tbl->it_page_shift; + + table_group->tce32_start = offset; + table_group->tce32_size = size; } struct iommu_table_ops iommu_table_pseries_ops = { @@ -651,8 +656,38 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); } +#ifdef CONFIG_IOMMU_API +static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned + long *tce, enum dma_data_direction *direction) +{ + long rc; + unsigned long ioba = (unsigned long) index << tbl->it_page_shift; + unsigned long flags, oldtce = 0; + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *tce | proto_tce; + + spin_lock_irqsave(&tbl->large_pool.lock, flags); + + rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce); + if (!rc) + rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce); + + if (!rc) { + *direction = iommu_tce_direction(oldtce); + *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + } + + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); + + return rc; +} +#endif + struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, +#ifdef CONFIG_IOMMU_API + .exchange = tce_exchange_pseries, +#endif .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP }; @@ -689,7 +724,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) if (!ppci->table_group) { ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); tbl = ppci->table_group->tables[0]; - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); + iommu_table_setparms_lpar(ppci->phb, pdn, tbl, + ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; iommu_init_table(tbl, ppci->phb->node); iommu_register_group(ppci->table_group, @@ -1143,7 +1179,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pci->table_group) { pci->table_group = iommu_pseries_alloc_group(pci->phb->node); tbl = pci->table_group->tables[0]; - iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); + iommu_table_setparms_lpar(pci->phb, pdn, tbl, + pci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; iommu_init_table(tbl, pci->phb->node); iommu_register_group(pci->table_group, diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 8b1fe895daa3..6541d0b03e4c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } + +static unsigned long vsid_unscramble(unsigned long vsid, int ssize) +{ + unsigned long protovsid; + unsigned long va_bits = VA_BITS; + unsigned long modinv, vsid_modulus; + unsigned long max_mod_inv, tmp_modinv; + + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + modinv = VSID_MULINV_256M; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1); + } else { + modinv = VSID_MULINV_1T; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1); + } + + /* + * vsid outside our range. + */ + if (vsid >= vsid_modulus) + return 0; + + /* + * If modinv is the modular multiplicate inverse of (x % vsid_modulus) + * and vsid = (protovsid * x) % vsid_modulus, then we say: + * protovsid = (vsid * modinv) % vsid_modulus + */ + + /* Check if (vsid * modinv) overflow (63 bits) */ + max_mod_inv = 0x7fffffffffffffffull / vsid; + if (modinv < max_mod_inv) + return (vsid * modinv) % vsid_modulus; + + tmp_modinv = modinv/max_mod_inv; + modinv %= max_mod_inv; + + protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus; + protovsid = (protovsid + vsid * modinv) % vsid_modulus; + + return protovsid; +} + +static int __init reserve_vrma_context_id(void) +{ + unsigned long protovsid; + + /* + * Reserve context ids which map to reserved virtual addresses. For now + * we only reserve the context id which maps to the VRMA VSID. We ignore + * the addresses in "ibm,adjunct-virtual-addresses" because we don't + * enable adjunct support via the "ibm,client-architecture-support" + * interface. + */ + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); + hash__reserve_context_id(protovsid >> ESID_BITS_1T); + return 0; +} +machine_device_initcall(pseries, reserve_vrma_context_id); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 904a677208d1..bb70b26334f0 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -386,6 +386,10 @@ int pSeries_system_reset_exception(struct pt_regs *regs) } fwnmi_release_errinfo(); } + + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; /* need to perform reset */ } diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index b4d362ed03a1..b5d86426e97b 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: CHRP %s\n", model); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } /* Initialize firmware assisted non-maskable interrupts if diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index f6f83aeccaaa..52ca6b311d44 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -55,11 +55,6 @@ */ static cpumask_var_t of_spin_mask; -/* - * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here - */ -static void (*xics_cause_ipi)(int cpu, unsigned long data); - /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ int smp_query_cpu_stopped(unsigned int pcpu) { @@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); @@ -187,28 +180,50 @@ static int smp_pSeries_kick_cpu(int nr) return 0; } -/* Only used on systems that support multiple IPI mechanisms */ -static void pSeries_cause_ipi_mux(int cpu, unsigned long data) +static void smp_pseries_cause_ipi(int cpu) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) - doorbell_cause_ipi(cpu, data); - else - xics_cause_ipi(cpu, data); + /* POWER9 should not use this handler */ + if (doorbell_try_core_ipi(cpu)) + return; + + icp_ops->cause_ipi(cpu); +} + +static int pseries_cause_nmi_ipi(int cpu) +{ + int hwcpu; + + if (cpu == NMI_IPI_ALL_OTHERS) { + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS; + } else { + if (cpu < 0) { + WARN_ONCE(true, "incorrect cpu parameter %d", cpu); + return 0; + } + + hwcpu = get_hard_smp_processor_id(cpu); + } + + if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS) + return 1; + + return 0; } static __init void pSeries_smp_probe(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) { - xics_cause_ipi = smp_ops->cause_ipi; - smp_ops->cause_ipi = pSeries_cause_ipi_mux; - } + if (cpu_has_feature(CPU_FTR_DBELL)) + smp_ops->cause_ipi = smp_pseries_cause_ipi; + else + smp_ops->cause_ipi = icp_ops->cause_ipi; } static struct smp_ops_t pseries_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ + .cause_nmi_ipi = pseries_cause_nmi_ipi, .probe = pSeries_smp_probe, .kick_cpu = smp_pSeries_kick_cpu, .setup_cpu = smp_setup_cpu, diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 720493932486..28b09fd797ec 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev) struct iommu_table *tbl = get_iommu_table_base(dev); if (tbl) - iommu_free_table(tbl, of_node_full_name(dev->of_node)); + iommu_tce_table_put(tbl); of_node_put(dev->of_node); kfree(to_vio_dev(dev)); } |