diff options
31 files changed, 378 insertions, 75 deletions
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 3a510f4a6b68..77e2cefe47eb 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -64,17 +64,17 @@ config PPC_EMULATED_STATS emulated. config CODE_PATCHING_SELFTEST - bool "Run self-tests of the code-patching code." + bool "Run self-tests of the code-patching code" depends on DEBUG_KERNEL default n config FTR_FIXUP_SELFTEST - bool "Run self-tests of the feature-fixup code." + bool "Run self-tests of the feature-fixup code" depends on DEBUG_KERNEL default n config MSI_BITMAP_SELFTEST - bool "Run self-tests of the MSI bitmap code." + bool "Run self-tests of the MSI bitmap code" depends on DEBUG_KERNEL default n diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 9e861b4378bd..2ff8b3df553d 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -33,7 +33,12 @@ #define _PAGE_F_GIX_SHIFT 12 #define _PAGE_F_SECOND 0x08000 /* Whether to use secondary hash or not */ #define _PAGE_SPECIAL 0x10000 /* software: special page */ + +#ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x20000 /* software: software dirty tracking */ +#else +#define _PAGE_SOFT_DIRTY 0x00000 +#endif /* * THP pages can't be special. So use the _PAGE_SPECIAL diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 03c1a5a21c0c..b3a5badab69f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -158,12 +158,22 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type) << _PAGE_BIT_SWAP_TYPE) \ | ((offset) << PTE_RPN_SHIFT) }) +/* + * swp_entry_t must be independent of pte bits. We build a swp_entry_t from + * swap type and offset we get from swap and convert that to pte to find a + * matching pte in linux page table. + * Clear bits not found in swap entries here. + */ +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) +#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) -#define __swp_entry_to_pte(x) __pte((x).val) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#else +#define _PAGE_SWP_SOFT_DIRTY 0UL +#endif /* CONFIG_MEM_SOFT_DIRTY */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); @@ -176,8 +186,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); } -#else -#define _PAGE_SWP_SOFT_DIRTY 0 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 8374afed9d0a..f8faaaeeca1e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -157,7 +157,8 @@ #define OPAL_LEDS_GET_INDICATOR 114 #define OPAL_LEDS_SET_INDICATOR 115 #define OPAL_CEC_REBOOT2 116 -#define OPAL_LAST 116 +#define OPAL_CONSOLE_FLUSH 117 +#define OPAL_LAST 117 /* Device tree flags */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 800115910e43..07a99e638449 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -35,6 +35,7 @@ int64_t opal_console_read(int64_t term_number, __be64 *length, uint8_t *buffer); int64_t opal_console_write_buffer_space(int64_t term_number, __be64 *length); +int64_t opal_console_flush(int64_t term_number); int64_t opal_rtc_read(__be32 *year_month_day, __be64 *hour_minute_second_millisecond); int64_t opal_rtc_write(uint32_t year_month_day, @@ -262,6 +263,8 @@ extern int opal_resync_timebase(void); extern void opal_lpc_init(void); +extern void opal_kmsg_init(void); + extern int opal_event_request(unsigned int opal_event_nr); struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 70bd4381f8e6..546540b91095 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -16,6 +16,7 @@ #ifdef CONFIG_PPC64 +#include <linux/string.h> #include <asm/types.h> #include <asm/lppaca.h> #include <asm/mmu.h> @@ -131,7 +132,16 @@ struct paca_struct { struct tlb_core_data tcd; #endif /* CONFIG_PPC_BOOK3E */ - mm_context_t context; +#ifdef CONFIG_PPC_BOOK3S + mm_context_id_t mm_ctx_id; +#ifdef CONFIG_PPC_MM_SLICES + u64 mm_ctx_low_slices_psize; + unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; +#else + u16 mm_ctx_user_psize; + u16 mm_ctx_sllp; +#endif +#endif /* * then miscellaneous read-write fields @@ -194,6 +204,23 @@ struct paca_struct { #endif }; +#ifdef CONFIG_PPC_BOOK3S +static inline void copy_mm_to_paca(mm_context_t *context) +{ + get_paca()->mm_ctx_id = context->id; +#ifdef CONFIG_PPC_MM_SLICES + get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; + memcpy(&get_paca()->mm_ctx_high_slices_psize, + &context->high_slices_psize, SLICE_ARRAY_SIZE); +#else + get_paca()->mm_ctx_user_psize = context->user_psize; + get_paca()->mm_ctx_sllp = context->sllp; +#endif +} +#else +static inline void copy_mm_to_paca(mm_context_t *context){} +#endif + extern struct paca_struct *paca; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 43686043e297..8dde19962a5b 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -43,5 +43,7 @@ #define PPC_FEATURE2_TAR 0x04000000 #define PPC_FEATURE2_VEC_CRYPTO 0x02000000 #define PPC_FEATURE2_HTM_NOSC 0x01000000 +#define PPC_FEATURE2_ARCH_3_00 0x00800000 /* ISA 3.00 */ +#define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float 128-bit */ #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */ diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index 59dad113897b..c2d21d11c2d2 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -295,6 +295,8 @@ do { \ #define R_PPC64_TLSLD 108 #define R_PPC64_TOCSAVE 109 +#define R_PPC64_ENTRY 118 + #define R_PPC64_REL16 249 #define R_PPC64_REL16_LO 250 #define R_PPC64_REL16_HI 251 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 221d584d089f..07cebc3514f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,14 +185,16 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened)); - DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); +#ifdef CONFIG_PPC_BOOK3S + DEFINE(PACACONTEXTID, offsetof(struct paca_struct, mm_ctx_id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, - context.low_slices_psize)); + mm_ctx_low_slices_psize)); DEFINE(PACAHIGHSLICEPSIZE, offsetof(struct paca_struct, - context.high_slices_psize)); + mm_ctx_high_slices_psize)); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ +#endif #ifdef CONFIG_PPC_BOOK3E DEFINE(PACAPGD, offsetof(struct paca_struct, pgd)); @@ -222,7 +224,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES DEFINE(MMUPSIZESLLP, offsetof(struct mmu_psize_def, sllp)); #else - DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp)); + DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, mm_ctx_sllp)); #endif /* CONFIG_PPC_MM_SLICES */ DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 68384514506b..59663af9315f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -635,6 +635,33 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, */ break; + case R_PPC64_ENTRY: + /* + * Optimize ELFv2 large code model entry point if + * the TOC is within 2GB range of current location. + */ + value = my_r2(sechdrs, me) - (unsigned long)location; + if (value + 0x80008000 > 0xffffffff) + break; + /* + * Check for the large code model prolog sequence: + * ld r2, ...(r12) + * add r2, r2, r12 + */ + if ((((uint32_t *)location)[0] & ~0xfffc) + != 0xe84c0000) + break; + if (((uint32_t *)location)[1] != 0x7c426214) + break; + /* + * If found, replace it with: + * addis r2, r12, (.TOC.-func)@ha + * addi r2, r12, (.TOC.-func)@l + */ + ((uint32_t *)location)[0] = 0x3c4c0000 + PPC_HA(value); + ((uint32_t *)location)[1] = 0x38420000 + PPC_LO(value); + break; + case R_PPC64_REL16_HA: /* Subtract location pointer */ value -= (unsigned long)location; diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 92dea8df6b26..da5192590c44 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -389,6 +389,7 @@ static void __init prom_printf(const char *format, ...) break; } } + va_end(args); } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index ea43a347a104..4f24606afc3f 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -61,3 +61,10 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) save_context_stack(trace, tsk->thread.ksp, tsk, 0); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +void +save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + save_context_stack(trace, regs->gpr[1], current, 0); +} +EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 4233dcccbaf7..ba59d5977f34 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -853,11 +853,11 @@ static unsigned int get_paca_psize(unsigned long addr) unsigned long index, mask_index; if (addr < SLICE_LOW_TOP) { - lpsizes = get_paca()->context.low_slices_psize; + lpsizes = get_paca()->mm_ctx_low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); return (lpsizes >> (index * 4)) & 0xF; } - hpsizes = get_paca()->context.high_slices_psize; + hpsizes = get_paca()->mm_ctx_high_slices_psize; index = GET_HIGH_SLICE_INDEX(addr); mask_index = index & 0x1; return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; @@ -866,7 +866,7 @@ static unsigned int get_paca_psize(unsigned long addr) #else unsigned int get_paca_psize(unsigned long addr) { - return get_paca()->context.user_psize; + return get_paca()->mm_ctx_user_psize; } #endif @@ -882,7 +882,8 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); copro_flush_all_slbs(mm); if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - get_paca()->context = mm->context; + + copy_mm_to_paca(&mm->context); slb_flush_and_rebolt(); } } @@ -949,7 +950,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, { if (user_region) { if (psize != get_paca_psize(ea)) { - get_paca()->context = mm->context; + copy_mm_to_paca(&mm->context); slb_flush_and_rebolt(); } } else if (get_paca()->vmalloc_sllp != diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 515730e499fe..825b6873391f 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -228,7 +228,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) asm volatile("slbie %0" : : "r" (slbie_data)); get_paca()->slb_cache_ptr = 0; - get_paca()->context = mm->context; + copy_mm_to_paca(&mm->context); /* * preload some userspace segments into the SLB. diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 0f432a702870..42954f0b47ac 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -185,8 +185,7 @@ static void slice_flush_segments(void *parm) if (mm != current->active_mm) return; - /* update the paca copy of the context struct */ - get_paca()->context = current->active_mm->context; + copy_mm_to_paca(¤t->active_mm->context); local_irq_save(flags); slb_flush_and_rebolt(); diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c index 76f5013c35e5..c3c9bbb3573a 100644 --- a/arch/powerpc/platforms/powermac/bootx_init.c +++ b/arch/powerpc/platforms/powermac/bootx_init.c @@ -84,6 +84,7 @@ static void __init bootx_printf(const char *format, ...) break; } } + va_end(args); } #else /* CONFIG_BOOTX_TEXT */ static void __init bootx_printf(const char *format, ...) {} diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index ee774e8a4837..f1516b5ecec9 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -2,6 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o +obj-y += opal-kmsg.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o npu-dma.o diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c new file mode 100644 index 000000000000..6f1214d4de92 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -0,0 +1,75 @@ +/* + * kmsg dumper that ensures the OPAL console fully flushes panic messages + * + * Author: Russell Currey <ruscur@russell.cc> + * + * Copyright 2015 IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/kmsg_dump.h> + +#include <asm/opal.h> +#include <asm/opal-api.h> + +/* + * Console output is controlled by OPAL firmware. The kernel regularly calls + * OPAL_POLL_EVENTS, which flushes some console output. In a panic state, + * however, the kernel no longer calls OPAL_POLL_EVENTS and the panic message + * may not be completely printed. This function does not actually dump the + * message, it just ensures that OPAL completely flushes the console buffer. + */ +static void force_opal_console_flush(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + int i; + int64_t ret; + + /* + * Outside of a panic context the pollers will continue to run, + * so we don't need to do any special flushing. + */ + if (reason != KMSG_DUMP_PANIC) + return; + + if (opal_check_token(OPAL_CONSOLE_FLUSH)) { + ret = opal_console_flush(0); + + if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER) + return; + + /* Incrementally flush until there's nothing left */ + while (opal_console_flush(0) != OPAL_SUCCESS); + } else { + /* + * If OPAL_CONSOLE_FLUSH is not implemented in the firmware, + * the console can still be flushed by calling the polling + * function enough times to flush the buffer. We don't know + * how much output still needs to be flushed, but we can be + * generous since the kernel is in panic and doesn't need + * to do much else. + */ + printk(KERN_NOTICE "opal: OPAL_CONSOLE_FLUSH missing.\n"); + for (i = 0; i < 1024; i++) { + opal_poll_events(NULL); + } + } +} + +static struct kmsg_dumper opal_kmsg_dumper = { + .dump = force_opal_console_flush +}; + +void __init opal_kmsg_init(void) +{ + int rc; + + /* Add our dumper to the list */ + rc = kmsg_dump_register(&opal_kmsg_dumper); + if (rc != 0) + pr_err("opal: kmsg_dump_register failed; returned %d\n", rc); +} diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 1b149c92fca1..f8868864f373 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -50,7 +50,7 @@ unsigned long __init opal_get_boot_time(void) rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); - else + else if (rc == OPAL_BUSY) mdelay(10); } if (rc != OPAL_SUCCESS) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index b7a464fef7a7..e45b88a5d7e0 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -301,3 +301,4 @@ OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aad0033d65d1..81f4a3ab8743 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -548,7 +548,7 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs) goto out; if ((regs->nip >= opal.base) && - (regs->nip <= (opal.base + opal.size))) + (regs->nip < (opal.base + opal.size))) recover_addr = find_recovery_address(regs->nip); /* @@ -748,6 +748,9 @@ static int __init opal_init(void) opal_pdev_init(opal_node, "ibm,opal-flash"); opal_pdev_init(opal_node, "ibm,opal-prd"); + /* Initialise OPAL kmsg dumper for flushing console on panic */ + opal_kmsg_init(); + return 0; } machine_subsys_initcall(powernv, opal_init); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 323e1e58da93..573ae1994097 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1074,16 +1074,75 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pnv_ioda_link_pe_by_weight(phb, pe); } -static void pnv_ioda_setup_dev_PEs(struct pci_bus *bus) +static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) +{ + int pe_num, found_pe = false, rc; + long rid; + struct pnv_ioda_pe *pe; + struct pci_dev *gpu_pdev; + struct pci_dn *npu_pdn; + struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); + struct pnv_phb *phb = hose->private_data; + + /* + * Due to a hardware errata PE#0 on the NPU is reserved for + * error handling. This means we only have three PEs remaining + * which need to be assigned to four links, implying some + * links must share PEs. + * + * To achieve this we assign PEs such that NPUs linking the + * same GPU get assigned the same PE. + */ + gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); + for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { + pe = &phb->ioda.pe_array[pe_num]; + if (!pe->pdev) + continue; + + if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) { + /* + * This device has the same peer GPU so should + * be assigned the same PE as the existing + * peer NPU. + */ + dev_info(&npu_pdev->dev, + "Associating to existing PE %d\n", pe_num); + pci_dev_get(npu_pdev); + npu_pdn = pci_get_pdn(npu_pdev); + rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; + npu_pdn->pcidev = npu_pdev; + npu_pdn->pe_number = pe_num; + pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Map the PE to this link */ + rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, + OpalPciBusAll, + OPAL_COMPARE_RID_DEVICE_NUMBER, + OPAL_COMPARE_RID_FUNCTION_NUMBER, + OPAL_MAP_PE); + WARN_ON(rc != OPAL_SUCCESS); + found_pe = true; + break; + } + } + + if (!found_pe) + /* + * Could not find an existing PE so allocate a new + * one. + */ + return pnv_ioda_setup_dev_PE(npu_pdev); + else + return pe; +} + +static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) { - struct pci_bus *child; struct pci_dev *pdev; list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_dev_PE(pdev); - - list_for_each_entry(child, &bus->children, node) - pnv_ioda_setup_dev_PEs(child); + pnv_ioda_setup_npu_PE(pdev); } static void pnv_ioda_setup_PEs(struct pci_bus *bus) @@ -1127,9 +1186,11 @@ static void pnv_pci_ioda_setup_PEs(void) * functions. PCI bus dependent PEs are required for the * remaining types of PHBs. */ - if (phb->type == PNV_PHB_NPU) - pnv_ioda_setup_dev_PEs(hose->bus); - else + if (phb->type == PNV_PHB_NPU) { + /* PE#0 is needed for error reporting */ + pnv_ioda_reserve_pe(phb, 0); + pnv_ioda_setup_npu_PEs(hose->bus); + } else pnv_ioda_setup_PEs(hose->bus); } } @@ -1612,7 +1673,10 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) /* Update peer npu devices */ if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; pe->peers[i]; i++) { + for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { + if (!pe->peers[i]) + continue; + linked_npu_dev = pe->peers[i]->pdev; if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) dma_set_mask(&linked_npu_dev->dev, dma_mask); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index ff4e42d9d259..2f55c86df703 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -1,8 +1,6 @@ /* * Support PCI/PCIe on PowerNV platforms * - * Currently supports only P5IOC2 - * * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. * * This program is free software; you can redistribute it and/or diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile index 6982f603fadc..be2ac5ce349f 100644 --- a/drivers/misc/cxl/Makefile +++ b/drivers/misc/cxl/Makefile @@ -1,4 +1,5 @@ -ccflags-y := -Werror -Wno-unused-const-variable +ccflags-y := $(call cc-disable-warning, unused-const-variable) +ccflags-$(CONFIG_PPC_WERROR) += -Werror cxl-y += main.o file.o irq.o fault.o native.o cxl-y += context.o sysfs.o debugfs.o pci.o trace.o diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a6543aefa299..ea3eeb7011e1 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -172,7 +172,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, if (task) { ctx->pid = get_task_pid(task, PIDTYPE_PID); - get_pid(ctx->pid); + ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); kernel = false; } diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 6dde7a9d6a7e..262b88eac414 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -42,7 +42,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, spin_lock_init(&ctx->sste_lock); ctx->afu = afu; ctx->master = master; - ctx->pid = NULL; /* Set in start work ioctl */ + ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */ mutex_init(&ctx->mapping_lock); ctx->mapping = mapping; @@ -217,7 +217,11 @@ int __detach_context(struct cxl_context *ctx) WARN_ON(cxl_detach_process(ctx) && cxl_adapter_link_ok(ctx->afu->adapter)); flush_work(&ctx->fault_work); /* Only needed for dedicated process */ + + /* release the reference to the group leader and mm handling pid */ put_pid(ctx->pid); + put_pid(ctx->glpid); + cxl_ctx_put(); return 0; } diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 25ae57fa79b0..a521bc72cec2 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -445,6 +445,9 @@ struct cxl_context { unsigned int sst_size, sst_lru; wait_queue_head_t wq; + /* pid of the group leader associated with the pid */ + struct pid *glpid; + /* use mm context associated with this pid for ds faults */ struct pid *pid; spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */ /* Only used in PR mode */ diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 25a5418c55cb..81c3f75b7330 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -166,13 +166,92 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, cxl_ack_irq(ctx, CXL_PSL_TFC_An_R, 0); } +/* + * Returns the mm_struct corresponding to the context ctx via ctx->pid + * In case the task has exited we use the task group leader accessible + * via ctx->glpid to find the next task in the thread group that has a + * valid mm_struct associated with it. If a task with valid mm_struct + * is found the ctx->pid is updated to use the task struct for subsequent + * translations. In case no valid mm_struct is found in the task group to + * service the fault a NULL is returned. + */ +static struct mm_struct *get_mem_context(struct cxl_context *ctx) +{ + struct task_struct *task = NULL; + struct mm_struct *mm = NULL; + struct pid *old_pid = ctx->pid; + + if (old_pid == NULL) { + pr_warn("%s: Invalid context for pe=%d\n", + __func__, ctx->pe); + return NULL; + } + + task = get_pid_task(old_pid, PIDTYPE_PID); + + /* + * pid_alive may look racy but this saves us from costly + * get_task_mm when the task is a zombie. In worst case + * we may think a task is alive, which is about to die + * but get_task_mm will return NULL. + */ + if (task != NULL && pid_alive(task)) + mm = get_task_mm(task); + + /* release the task struct that was taken earlier */ + if (task) + put_task_struct(task); + else + pr_devel("%s: Context owning pid=%i for pe=%i dead\n", + __func__, pid_nr(old_pid), ctx->pe); + + /* + * If we couldn't find the mm context then use the group + * leader to iterate over the task group and find a task + * that gives us mm_struct. + */ + if (unlikely(mm == NULL && ctx->glpid != NULL)) { + + rcu_read_lock(); + task = pid_task(ctx->glpid, PIDTYPE_PID); + if (task) + do { + mm = get_task_mm(task); + if (mm) { + ctx->pid = get_task_pid(task, + PIDTYPE_PID); + break; + } + task = next_thread(task); + } while (task && !thread_group_leader(task)); + rcu_read_unlock(); + + /* check if we switched pid */ + if (ctx->pid != old_pid) { + if (mm) + pr_devel("%s:pe=%i switch pid %i->%i\n", + __func__, ctx->pe, pid_nr(old_pid), + pid_nr(ctx->pid)); + else + pr_devel("%s:Cannot find mm for pid=%i\n", + __func__, pid_nr(old_pid)); + + /* drop the reference to older pid */ + put_pid(old_pid); + } + } + + return mm; +} + + + void cxl_handle_fault(struct work_struct *fault_work) { struct cxl_context *ctx = container_of(fault_work, struct cxl_context, fault_work); u64 dsisr = ctx->dsisr; u64 dar = ctx->dar; - struct task_struct *task = NULL; struct mm_struct *mm = NULL; if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr || @@ -195,17 +274,17 @@ void cxl_handle_fault(struct work_struct *fault_work) "DSISR: %#llx DAR: %#llx\n", ctx->pe, dsisr, dar); if (!ctx->kernel) { - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_handle_fault unable to get task %i\n", - pid_nr(ctx->pid)); + + mm = get_mem_context(ctx); + /* indicates all the thread in task group have exited */ + if (mm == NULL) { + pr_devel("%s: unable to get mm for pe=%d pid=%i\n", + __func__, ctx->pe, pid_nr(ctx->pid)); cxl_ack_ae(ctx); return; - } - if (!(mm = get_task_mm(task))) { - pr_devel("cxl_handle_fault unable to get mm %i\n", - pid_nr(ctx->pid)); - cxl_ack_ae(ctx); - goto out; + } else { + pr_devel("Handling page fault for pe=%d pid=%i\n", + ctx->pe, pid_nr(ctx->pid)); } } @@ -218,33 +297,22 @@ void cxl_handle_fault(struct work_struct *fault_work) if (mm) mmput(mm); -out: - if (task) - put_task_struct(task); } static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) { - int rc; - struct task_struct *task; struct mm_struct *mm; - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_prefault_one unable to get task %i\n", - pid_nr(ctx->pid)); - return; - } - if (!(mm = get_task_mm(task))) { + mm = get_mem_context(ctx); + if (mm == NULL) { pr_devel("cxl_prefault_one unable to get mm %i\n", pid_nr(ctx->pid)); - put_task_struct(task); return; } - rc = cxl_fault_segment(ctx, mm, ea); + cxl_fault_segment(ctx, mm, ea); mmput(mm); - put_task_struct(task); } static u64 next_segment(u64 ea, u64 vsid) @@ -263,18 +331,13 @@ static void cxl_prefault_vma(struct cxl_context *ctx) struct copro_slb slb; struct vm_area_struct *vma; int rc; - struct task_struct *task; struct mm_struct *mm; - if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) { - pr_devel("cxl_prefault_vma unable to get task %i\n", - pid_nr(ctx->pid)); - return; - } - if (!(mm = get_task_mm(task))) { + mm = get_mem_context(ctx); + if (mm == NULL) { pr_devel("cxl_prefault_vm unable to get mm %i\n", pid_nr(ctx->pid)); - goto out1; + return; } down_read(&mm->mmap_sem); @@ -295,8 +358,6 @@ static void cxl_prefault_vma(struct cxl_context *ctx) up_read(&mm->mmap_sem); mmput(mm); -out1: - put_task_struct(task); } void cxl_prefault(struct cxl_context *ctx, u64 wed) diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 5cc14599837d..783337d22f36 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -201,8 +201,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, * where a process (master, some daemon, etc) has opened the chardev on * behalf of another process, so the AFU's mm gets bound to the process * that performs this ioctl and not the process that opened the file. + * Also we grab the PID of the group leader so that if the task that + * has performed the attach operation exits the mm context of the + * process is still accessible. */ - ctx->pid = get_pid(get_task_pid(current, PIDTYPE_PID)); + ctx->pid = get_task_pid(current, PIDTYPE_PID); + ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID); trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 85761d7eb333..4c1903f781fc 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -138,6 +138,7 @@ static const struct pci_device_id cxl_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0477), }, { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), }, { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), }, + { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), }, { PCI_DEVICE_CLASS(0x120000, ~0), }, { } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 826470d7f000..96e2486a6fc4 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -263,7 +263,8 @@ if ($arch eq "x86_64") { } elsif ($arch eq "powerpc") { $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)"; - $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:"; + # See comment in the sparc64 section for why we use '\w'. + $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?\\w*?)>:"; $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$"; if ($bits == 64) { |