From 8119cefd9a29b71997e62b762932d23499ba4896 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 14 Jul 2021 18:17:58 +0530 Subject: powerpc/kexec: blacklist functions called in real mode for kprobe As kprobe does not handle events happening in real mode, blacklist the functions that only get called in real mode or in kexec sequence with MMU turned off. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/162626687834.155313.4692863392927831843.stgit@hbathini-workstation.ibm.com --- arch/powerpc/mm/book3s64/hash_native.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 52e170bd95ae..d8279bfe68ea 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -787,7 +787,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9ffa65074cb0..300099de553b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -172,8 +172,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e50ddf129c15..ae20add7954a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -679,7 +679,8 @@ void radix__early_init_mmu_secondary(void) mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; -- cgit v1.2.3 From 9c7248bb8de31f51c693bfa6a6ea53b1c07e0fa8 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Tue, 11 May 2021 09:31:36 +0200 Subject: powerpc/numa: Consider the max NUMA node for migratable LPAR When a LPAR is migratable, we should consider the maximum possible NUMA node instead of the number of NUMA nodes from the actual system. The DT property 'ibm,current-associativity-domains' defines the maximum number of nodes the LPAR can see when running on that box. But if the LPAR is being migrated on another box, it may see up to the nodes defined by 'ibm,max-associativity-domains'. So if a LPAR is migratable, that value should be used. Unfortunately, there is no easy way to know if an LPAR is migratable or not. The hypervisor exports the property 'ibm,migratable-partition' in the case it set to migrate partition, but that would not mean that the current partition is migratable. Without this patch, when a LPAR is started on a 2 node box and then migrated to a 3 node box, the hypervisor may spread the LPAR's CPUs on the 3rd node. In that case if a CPU from that 3rd node is added to the LPAR, it will be wrongly assigned to the node because the kernel has been set to use up to 2 nodes (the configuration of the departure node). With this patch applies, the CPU is correctly added to the 3rd node. Fixes: f9f130ff2ec9 ("powerpc/numa: Detect support for coregroup") Signed-off-by: Laurent Dufour Reviewed-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210511073136.17795-1-ldufour@linux.ibm.com --- arch/powerpc/mm/numa.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..094a1076fd1f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -893,7 +893,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - const __be32 *domains; + const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -909,9 +909,14 @@ static void __init find_possible_nodes(void) * it doesn't exist, then fallback on ibm,max-associativity-domains. * Current denotes what the platform can support compared to max * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. */ - domains = of_get_property(rtas, "ibm,current-associativity-domains", - &prop_length); + if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -920,6 +925,8 @@ static void __init find_possible_nodes(void) } max_nodes = of_read_number(&domains[min_common_depth], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); -- cgit v1.2.3 From d144f4d5a8a804133d20ff311d7be70bcdbfaac2 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 17 May 2021 11:06:06 +0200 Subject: pseries/drmem: update LMBs after LPM After a LPM, the device tree node ibm,dynamic-reconfiguration-memory may be updated by the hypervisor in the case the NUMA topology of the LPAR's memory is updated. This is handled by the kernel, but the memory's node is not updated because there is no way to move a memory block between nodes from the Linux kernel point of view. If later a memory block is added or removed, drmem_update_dt() is called and it is overwriting the DT node ibm,dynamic-reconfiguration-memory to match the added or removed LMB. But the LMB's associativity node has not been updated after the DT node update and thus the node is overwritten by the Linux's topology instead of the hypervisor one. Introduce a hook called when the ibm,dynamic-reconfiguration-memory node is updated to force an update of the LMB's associativity. However, ignore the call to that hook when the update has been triggered by drmem_update_dt(). Because, in that case, the LMB tree has been used to set the DT property and thus it doesn't need to be updated back. Since drmem_update_dt() is called under the protection of the device_hotplug_lock and the hook is called in the same context, use a simple boolean variable to detect that call. Signed-off-by: Laurent Dufour Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210517090606.56930-1-ldufour@linux.ibm.com --- arch/powerpc/include/asm/drmem.h | 1 + arch/powerpc/mm/drmem.c | 46 +++++++++++++++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +++ 3 files changed, 51 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index bf2402fed3e0..4265d5e95c2c 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -111,6 +111,7 @@ int drmem_update_dt(void); int __init walk_drmem_lmbs_early(unsigned long node, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)); +void drmem_update_lmbs(struct property *prop); #endif static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 9af3832c9d8d..22197b18d85e 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -18,6 +18,7 @@ static int n_root_addr_cells, n_root_size_cells; static struct drmem_lmb_info __drmem_info; struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; u64 drmem_lmb_memory_max(void) { @@ -178,6 +179,11 @@ int drmem_update_dt(void) if (!memory) return -1; + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. + */ + in_drmem_update = true; prop = of_find_property(memory, "ibm,dynamic-memory", NULL); if (prop) { rc = drmem_update_dt_v1(memory, prop); @@ -186,6 +192,7 @@ int drmem_update_dt(void) if (prop) rc = drmem_update_dt_v2(memory, prop); } + in_drmem_update = false; of_node_put(memory); return rc; @@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data, return ret; } +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. + */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} #endif static int init_drmem_lmb_size(struct device_node *dn) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..0beb3ca2b549 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -979,6 +979,10 @@ static int pseries_memory_notifier(struct notifier_block *nb, case OF_RECONFIG_DETACH_NODE: err = pseries_remove_mem_node(rd->dn); break; + case OF_RECONFIG_UPDATE_PROPERTY: + if (!strcmp(rd->dn->name, + "ibm,dynamic-reconfiguration-memory")) + drmem_update_lmbs(rd->prop); } return notifier_from_errno(err); } -- cgit v1.2.3 From 3e188b1ae8807f26cc5a530a9d55f3f643fe050a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:30 +0530 Subject: powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling. Also add a debugfs entry for tlb_local_single_page_flush_ceiling. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7..1fa2bc6a969e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" @@ -1106,8 +1107,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -1524,3 +1525,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + -- cgit v1.2.3 From dbf77fed8b302e87561c7c2fc06050c88f4d3120 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:31 +0530 Subject: powerpc: rename powerpc_debugfs_root to arch_debugfs_dir No functional change in this patch. arch_debugfs_dir is the generic kernel name declared in linux/debugfs.h for arch-specific debugfs directory. Architectures like x86/s390 already use the name. Rename powerpc specific powerpc_debugfs_root to arch_debugfs_dir. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/debugfs.h | 13 ------------- arch/powerpc/kernel/Makefile | 3 ++- arch/powerpc/kernel/dawr.c | 3 +-- arch/powerpc/kernel/eeh.c | 16 ++++++++-------- arch/powerpc/kernel/eeh_cache.c | 4 ++-- arch/powerpc/kernel/fadump.c | 4 ++-- arch/powerpc/kernel/hw_breakpoint.c | 1 - arch/powerpc/kernel/kdebugfs.c | 14 ++++++++++++++ arch/powerpc/kernel/security.c | 16 ++++++++-------- arch/powerpc/kernel/setup-common.c | 13 ------------- arch/powerpc/kernel/setup_64.c | 1 - arch/powerpc/kernel/traps.c | 4 ++-- arch/powerpc/kvm/book3s_xics.c | 6 +++--- arch/powerpc/kvm/book3s_xive.c | 3 +-- arch/powerpc/kvm/book3s_xive_native.c | 3 +-- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_tlb.c | 6 +++--- arch/powerpc/mm/ptdump/bats.c | 4 ++-- arch/powerpc/mm/ptdump/segment_regs.c | 4 ++-- arch/powerpc/platforms/cell/axon_msi.c | 4 ++-- arch/powerpc/platforms/powernv/memtrace.c | 3 +-- arch/powerpc/platforms/powernv/opal-imc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-lpc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-xscom.c | 4 ++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- arch/powerpc/platforms/pseries/dtl.c | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 5 +++-- arch/powerpc/sysdev/xive/common.c | 3 +-- arch/powerpc/xmon/xmon.c | 6 +++--- 30 files changed, 75 insertions(+), 92 deletions(-) delete mode 100644 arch/powerpc/include/asm/debugfs.h create mode 100644 arch/powerpc/kernel/kdebugfs.c (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h deleted file mode 100644 index 2c5c48571d75..000000000000 --- a/arch/powerpc/include/asm/debugfs.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_DEBUGFS_H -#define _ASM_POWERPC_DEBUGFS_H - -/* - * Copyright 2017, Michael Ellerman, IBM Corporation. - */ - -#include - -extern struct dentry *powerpc_debugfs_root; - -#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f66b63e81c3b..7be36c1e1db6 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o firmware.o \ - hw_breakpoint_constraints.o interrupt.o + hw_breakpoint_constraints.o interrupt.o \ + kdebugfs.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index cdc2dccb987d..64e423d2fe0f 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -101,7 +100,7 @@ static int __init dawr_force_setup(void) if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { /* Turn DAWR off by default, but allow admin to turn it on */ debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &dawr_force_enable, &dawr_enable_fops); } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bbdcc86d01b..e9b597ed423c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -21,9 +21,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void) proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show); #ifdef CONFIG_DEBUG_FS debugfs_create_file_unsafe("eeh_enable", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, - powerpc_debugfs_root, &eeh_max_freezes); + arch_debugfs_dir, &eeh_max_freezes); debugfs_create_bool("eeh_disable_recovery", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &eeh_debugfs_no_recover); debugfs_create_file_unsafe("eeh_dev_check", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_dev_break", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_force_recover_fops); debugfs_create_file_unsafe("eeh_dev_can_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_can_recover_fops); eeh_cache_debugfs_init(); #endif diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index bf3270426d82..9bdaaf7fddc9 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -12,8 +12,8 @@ #include #include #include +#include #include -#include #include @@ -283,6 +283,6 @@ DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); void eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b990075285f5..b7ceb041743c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -24,8 +24,8 @@ #include #include #include +#include -#include #include #include #include @@ -1557,7 +1557,7 @@ static void fadump_init_files(void) return; } - debugfs_create_file("fadump_region", 0444, powerpc_debugfs_root, NULL, + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, &fadump_region_fops); if (fw_dump.dump_active) { diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 21a638aff72f..91a3be14808b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c new file mode 100644 index 000000000000..36d3124d5a8b --- /dev/null +++ b/arch/powerpc/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("powerpc", NULL); + return 0; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index cc51fa52e783..1a998490fe60 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -106,7 +106,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, static __init int barrier_nospec_debugfs_init(void) { debugfs_create_file_unsafe("barrier_nospec", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_barrier_nospec); return 0; } @@ -114,7 +114,7 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { - debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + debugfs_create_x64("security_features", 0400, arch_debugfs_dir, &powerpc_security_features); return 0; } @@ -420,7 +420,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir, NULL, &fops_stf_barrier); return 0; } @@ -748,7 +748,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, static __init int count_cache_flush_debugfs_init(void) { debugfs_create_file_unsafe("count_cache_flush", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_count_cache_flush); return 0; } @@ -834,9 +834,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set static __init int rfi_flush_debugfs_init(void) { - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); - debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); + debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush); return 0; } device_initcall(rfi_flush_debugfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index aa9c2d01424a..b1e43b69a559 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -773,18 +772,6 @@ static int __init check_cache_coherency(void) late_initcall(check_cache_coherency); #endif /* CONFIG_CHECK_CACHE_COHERENCY */ -#ifdef CONFIG_DEBUG_FS -struct dentry *powerpc_debugfs_root; -EXPORT_SYMBOL(powerpc_debugfs_root); - -static int powerpc_debugfs_init(void) -{ - powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); - return 0; -} -arch_initcall(powerpc_debugfs_init); -#endif - void ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1ff258f6c76c..eaa79a0996d1 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..c8f648727d36 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,10 +37,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2267,7 +2267,7 @@ static int __init ppc_warn_emulated_init(void) struct ppc_emulated_entry *entries = (void *)&ppc_emulated; dir = debugfs_create_dir("emulated_instructions", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated); diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 303e3cb096db..ebd5d920de8c 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -10,13 +10,13 @@ #include #include #include - +#include #include + #include #include #include #include -#include #include #include @@ -1024,7 +1024,7 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) return; } - xics->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xics, &xics_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 912c1e9eef6b..a18db9e16ea4 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -2360,7 +2359,7 @@ static void xive_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir, xive, &xive_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index af65ea21bde7..99db9ac49901 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -1268,7 +1267,7 @@ static void xive_native_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xive, &xive_native_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ac5720371c0d..c145776d3ae5 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -36,8 +36,8 @@ #include #include #include +#include -#include #include #include #include @@ -2072,7 +2072,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 300099de553b..9e16c7b1a6c5 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,9 +6,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -520,7 +520,7 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. */ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 1fa2bc6a969e..7724af19ed7e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include #include "internal.h" @@ -1529,9 +1529,9 @@ EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); static int __init create_tlb_single_page_flush_ceiling(void) { debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_single_page_flush_ceiling); debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); return 0; } late_initcall(create_tlb_single_page_flush_ceiling); diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index c4c628b03cf8..8bf7383fb26c 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include "ptdump.h" @@ -103,7 +103,7 @@ static const struct file_operations bats_fops = { static int __init bats_init(void) { debugfs_create_file("block_address_translation", 0400, - powerpc_debugfs_root, NULL, &bats_fops); + arch_debugfs_dir, NULL, &bats_fops); return 0; } device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 565048a0c9be..9223dfb85c51 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -6,7 +6,7 @@ * This dumps the content of Segment Registers */ -#include +#include static void seg_show(struct seq_file *m, int i) { @@ -55,7 +55,7 @@ static const struct file_operations sr_fops = { static int __init sr_init(void) { - debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root, + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, NULL, &sr_fops); return 0; } diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index ca2555b8a0c2..82335e364c44 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include #include #include @@ -480,6 +480,6 @@ void axon_msi_debug_setup(struct device_node *dn, struct axon_msic *msic) snprintf(name, sizeof(name), "msic_%d", of_node_to_nid(dn)); - debugfs_create_file(name, 0600, powerpc_debugfs_root, msic, &fops_msic); + debugfs_create_file(name, 0600, arch_debugfs_dir, msic, &fops_msic); } #endif /* DEBUG */ diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 537a4daed614..877720c64515 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -18,7 +18,6 @@ #include #include #include -#include #include /* This enables us to keep track of the memory removed from each node. */ @@ -330,7 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, static int memtrace_init(void) { memtrace_debugfs_dir = debugfs_create_dir("memtrace", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_file("enable", 0600, memtrace_debugfs_dir, NULL, &memtrace_init_fops); diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index ba02a75c1410..05d3832019b9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -13,11 +13,11 @@ #include #include #include +#include #include #include #include #include -#include static struct dentry *imc_debugfs_parent; @@ -56,7 +56,7 @@ static void export_imc_mode_and_cmd(struct device_node *node, u32 cb_offset; struct imc_mem_info *ptr = pmu_ptr->mem_info; - imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); + imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir); if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 608569082ba0..1e5d51db40f8 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -10,13 +10,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include static int opal_lpc_chip_id = -1; @@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void) if (opal_lpc_chip_id < 0) return -ENODEV; - root = debugfs_create_dir("lpc", powerpc_debugfs_root); + root = debugfs_create_dir("lpc", arch_debugfs_dir); rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index fd510d961b8c..6b4eed2ef4fa 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -14,11 +14,11 @@ #include #include #include +#include #include #include #include -#include #include static u64 opal_scom_unmangle(u64 addr) @@ -189,7 +189,7 @@ static int scom_debug_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return 0; - root = debugfs_create_dir("scom", powerpc_debugfs_root); + root = debugfs_create_dir("scom", arch_debugfs_dir); if (!root) return -1; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2389cd79c3c8..3dd35c327d1c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -2475,7 +2475,7 @@ static void pnv_pci_ioda_create_dbgfs(void) phb = hose->private_data; sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, phb, &pnv_pci_diag_data_fops); diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 982f069e4c31..352af5b14a0f 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -338,7 +338,7 @@ static int dtl_init(void) /* set up common debugfs structure */ - dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir); debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 869ef638698a..bd1fcb0881ea 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include #include @@ -39,7 +41,6 @@ #include #include #include -#include #include #include "pseries.h" @@ -2019,7 +2020,7 @@ static int __init vpa_debugfs_init(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; - vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 4018964bbd69..458645c7a72b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -1769,7 +1768,7 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug); int xive_core_debug_init(void) { if (xive_enabled()) - debugfs_create_file("xive", 0400, powerpc_debugfs_root, + debugfs_create_file("xive", 0400, arch_debugfs_dir, NULL, &xive_core_debug_fops); return 0; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index da4d7f225a40..ead460b80905 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -26,8 +26,8 @@ #include #include #include +#include -#include #include #include #include @@ -4077,8 +4077,8 @@ DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, static int __init setup_xmon_dbgfs(void) { - debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, - &xmon_dbgfs_ops); + debugfs_create_file("xmon", 0600, arch_debugfs_dir, NULL, + &xmon_dbgfs_ops); return 0; } device_initcall(setup_xmon_dbgfs); -- cgit v1.2.3 From 7e35ef662ca05c42dbc2f401bb76d9219dd7fd02 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:19 +0530 Subject: powerpc/pseries: rename min_common_depth to primary_domain_index No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/numa.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1f..79132744b728 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; static int form1_affinity; @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity) if (!numa_enabled) goto out; - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); + if (of_read_number(associativity, 1) >= primary_domain_index) + nid = of_read_number(&associativity[primary_domain_index], 1); /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= nr_node_ids) @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static int __init find_primary_domain_index(void) { - int depth; + int index; struct device_node *root; if (firmware_has_feature(FW_FEATURE_OPAL)) @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void) } if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); + index = of_read_number(distance_ref_points, 1); } else { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void) goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); } /* @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void) } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; nid = of_read_number(&aa.arrays[index], 1); if (nid == 0xffff || nid >= nr_node_ids) @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void) return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * Even though we connect cpus to numa domains later in SMP @@ -924,7 +924,7 @@ static void __init find_possible_nodes(void) goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); for (i = 0; i < max_nodes; i++) { @@ -933,7 +933,7 @@ static void __init find_possible_nodes(void) } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -1266,7 +1266,7 @@ int cpu_to_coregroup_id(int cpu) goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: -- cgit v1.2.3 From 0eacd06bb8adea8dd9edb0a30144166d9f227e64 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:20 +0530 Subject: powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Also make related code cleanup that will allow adding FORM2_AFFINITY in later patches. No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/firmware.h | 4 ++-- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/mm/numa.c | 35 ++++++++++++++++++------------- arch/powerpc/platforms/pseries/firmware.c | 2 +- 5 files changed, 26 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7604673787d6..60b631161360 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -44,7 +44,7 @@ #define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) #define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000) #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000) -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000) +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000) #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) @@ -69,7 +69,7 @@ enum { FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 324a13351749..df9fec9d232c 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_MSI 0x0201 /* PCIe/MSI support */ #define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */ #define OV5_XCMO 0x0440 /* Page Coalescing */ -#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ +#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a5bf355ce1d6..57db605ad33a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 79132744b728..0bad11b3e929 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data); static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; @@ -190,7 +193,7 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid, { int i; - if (!form1_affinity) + if (affinity_form != FORM1_AFFINITY) return; for (i = 0; i < distance_ref_points_depth; i++) { @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void) int index; struct device_node *root; + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; + if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); else @@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void) } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - index = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + "short ibm,associativity-reference-points\n"); goto err; } index = of_read_number(&distance_ref_points[1], 1); + } else { + index = of_read_number(distance_ref_points, 1); } /* diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 4c7b7f5a2ebc..5d4c2bc20bba 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -119,7 +119,7 @@ struct vec5_fw_feature { static __initdata struct vec5_fw_feature vec5_fw_features_table[] = { - {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, -- cgit v1.2.3 From 8ddc6448ec5a5ef50eaa581a7dec0e12a02850ff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:21 +0530 Subject: powerpc/pseries: Consolidate different NUMA distance update code paths The associativity details of the newly added resourced are collected from the hypervisor via "ibm,configure-connector" rtas call. Update the numa distance details of the newly added numa node after the above call. Instead of updating NUMA distance every time we lookup a node id from the associativity property, add helpers that can be used during boot which does this only once. Also remove the distance update from node id lookup helpers. Currently, we duplicate parsing code for ibm,associativity and ibm,associativity-lookup-arrays in the kernel. The associativity array provided by these device tree properties are very similar and hence can use a helper to parse the node id and numa distance details. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-4-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 2 + arch/powerpc/mm/numa.c | 212 +++++++++++++++++------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 + arch/powerpc/platforms/pseries/hotplug-memory.c | 2 + 4 files changed, 161 insertions(+), 57 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index e4db64c0e184..a6425a70c37b 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu) } int of_drconf_to_nid_single(struct drmem_lmb *lmb); +void update_numa_distance(struct device_node *node); #else @@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } +static inline void update_numa_distance(struct device_node *node) {} #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 0bad11b3e929..3cda37c52f26 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -208,50 +208,35 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { - int i; + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; - if (affinity_form != FORM1_AFFINITY) - return; + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; + nid = of_read_number(&associativity[index], 1); - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; } - /* * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA * info is found. */ static int associativity_to_nid(const __be32 *associativity) { - int nid = NUMA_NO_NODE; - - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= primary_domain_index) - nid = of_read_number(&associativity[primary_domain_index], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } + int array_sz = of_read_number(associativity, 1); -out: - return nid; + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); } /* Returns the nid associated with the given device tree node, @@ -287,6 +272,60 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) +{ + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } +} + static int __init find_primary_domain_index(void) { int index; @@ -433,6 +472,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } +static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -453,26 +524,19 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; - nid = of_read_number(&aa.arrays[index], 1); - - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; + const __be32 *associativity; - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -492,12 +556,30 @@ static int vphn_get_nid(long lcpu) rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -692,7 +774,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -709,6 +791,7 @@ static int __init parse_numa_properties(void) struct device_node *memory; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { printk(KERN_WARNING "NUMA disabled by user\n"); @@ -734,18 +817,30 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } @@ -781,8 +876,11 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index e1f224320102..1ef40ef699a6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -580,6 +580,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } + update_numa_distance(dn); + rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 0beb3ca2b549..14fccd7b9c99 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } + update_numa_distance(lmb_node); + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!dr_node) { dlpar_free_cc_nodes(lmb_node); -- cgit v1.2.3 From ef31cb83d19c4c589d650747cd5a7e502be9f665 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:22 +0530 Subject: powerpc/pseries: Add a helper for form1 cpu distance This helper is only used with the dispatch trace log collection. A later patch will add Form2 affinity support and this change helps in keeping that simpler. Also add a comment explaining we don't expect the code to be called with FORM0 Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-5-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 4 ++-- arch/powerpc/mm/numa.c | 10 +++++++++- arch/powerpc/platforms/pseries/lpar.c | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6425a70c37b..a4712ecad3e9 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) @@ -84,7 +84,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev, static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { return 0; } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3cda37c52f26..fe3a9a38eddf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index bd1fcb0881ea..3df6bdfea475 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -262,7 +262,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) return -EIO; - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); } static int cpu_home_node_dispatch_distance(int disp_cpu) @@ -282,7 +282,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) if (!disp_cpu_assoc || !vcpu_assoc) return -EIO; - return cpu_distance(disp_cpu_assoc, vcpu_assoc); + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); } static void update_vcpu_disp_stat(int disp_cpu) -- cgit v1.2.3 From 1c6b5a7e74052768977855f95d6b8812f6e7772c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:23 +0530 Subject: powerpc/pseries: Add support for FORM2 associativity PAPR interface currently supports two different ways of communicating resource grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping. Form 0 is the older format and is now considered deprecated. This patch adds another resource grouping named FORM2. Signed-off-by: Daniel Henrique Barboza Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-6-aneesh.kumar@linux.ibm.com --- Documentation/powerpc/associativity.rst | 104 +++++++++++++++++ arch/powerpc/include/asm/firmware.h | 3 +- arch/powerpc/include/asm/prom.h | 1 + arch/powerpc/kernel/prom_init.c | 3 +- arch/powerpc/mm/numa.c | 187 ++++++++++++++++++++++++------ arch/powerpc/platforms/pseries/firmware.c | 1 + 6 files changed, 262 insertions(+), 37 deletions(-) create mode 100644 Documentation/powerpc/associativity.rst (limited to 'arch/powerpc/mm') diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst new file mode 100644 index 000000000000..07e7dd3d6c87 --- /dev/null +++ b/Documentation/powerpc/associativity.rst @@ -0,0 +1,104 @@ +============================ +NUMA resource associativity +============================= + +Associativity represents the groupings of the various platform resources into +domains of substantially similar mean performance relative to resources outside +of that domain. Resources subsets of a given domain that exhibit better +performance relative to each other than relative to other resources subsets +are represented as being members of a sub-grouping domain. This performance +characteristic is presented in terms of NUMA node distance within the Linux kernel. +From the platform view, these groups are also referred to as domains. + +PAPR interface currently supports different ways of communicating these resource +grouping details to the OS. These are referred to as Form 0, Form 1 and Form2 +associativity grouping. Form 0 is the oldest format and is now considered deprecated. + +Hypervisor indicates the type/form of associativity used via "ibm,architecture-vec-5 property". +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1. +A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used. + +Form 0 +----- +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE). + +Form 1 +----- +With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity +device tree properties are used to determine the NUMA distance between resource groups/domains. + +The “ibm,associativity” property contains a list of one or more numbers (domainID) +representing the resource’s platform grouping domains. + +The “ibm,associativity-reference-points” property contains a list of one or more numbers +(domainID index) that represents the 1 based ordinal in the associativity lists. +The list of domainID indexes represents an increasing hierarchy of resource grouping. + +ex: +{ primary domainID index, secondary domainID index, tertiary domainID index.. } + +Linux kernel uses the domainID at the primary domainID index as the NUMA node id. +Linux kernel computes NUMA distance between two domains by recursively comparing +if they belong to the same higher-level domains. For mismatch at every higher +level of the resource group, the kernel doubles the NUMA distance between the +comparing domains. + +Form 2 +------- +Form 2 associativity format adds separate device tree properties representing NUMA node distance +thereby making the node distance computation flexible. Form 2 also allows flexible primary +domain numbering. With numa distance computation now detached from the index value in +"ibm,associativity-reference-points" property, Form 2 allows a large number of primary domain +ids at the same domainID index representing resource groups of different performance/latency +characteristics. + +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the +"ibm,architecture-vec-5" property. + +"ibm,numa-lookup-index-table" property contains a list of one or more numbers representing +the domainIDs present in the system. The offset of the domainID in this property is +used as an index while computing numa distance information via "ibm,numa-distance-table". + +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by +N domainID encoded as with encode-int + +For ex: +"ibm,numa-lookup-index-table" = {4, 0, 8, 250, 252}. The offset of domainID 8 (2) is used when +computing the distance of domain 8 from other domains present in the system. For the rest of +this document, this offset will be referred to as domain distance offset. + +"ibm,numa-distance-table" property contains a list of one or more numbers representing the NUMA +distance between resource groups/domains present in the system. + +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by +N distance values encoded as with encode-bytes. The max distance value we could encode is 255. +The number N must be equal to the square of m where m is the number of domainIDs in the +numa-lookup-index-table. + +For ex: +ibm,numa-lookup-index-table = <3 0 8 40>; +ibm,numa-distace-table = <9>, /bits/ 8 < 10 20 80 + 20 10 160 + 80 160 10>; + | 0 8 40 +--|------------ + | +0 | 10 20 80 + | +8 | 20 10 160 + | +40| 80 160 10 + +A possible "ibm,associativity" property for resources in node 0, 8 and 40 + +{ 3, 6, 7, 0 } +{ 3, 6, 9, 8 } +{ 3, 6, 7, 40} + +With "ibm,associativity-reference-points" { 0x3 } + +"ibm,lookup-index-table" helps in having a compact representation of distance matrix. +Since domainID can be sparse, the matrix of distances can also be effectively sparse. +With "ibm,lookup-index-table" we can achieve a compact representation of +distance information. diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 60b631161360..97a3bd9ffeb9 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -53,6 +53,7 @@ #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) #ifndef __ASSEMBLY__ @@ -73,7 +74,7 @@ enum { FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | - FW_FEATURE_RPT_INVALIDATE, + FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index df9fec9d232c..5c80152e8f18 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ +#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ #define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 57db605ad33a..95a42d49e291 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | + OV5_FEAT(OV5_FORM2_AFFINITY), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe3a9a38eddf..fe9c6ced40b2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells; #define FORM0_AFFINITY 0 #define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -166,6 +171,54 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) +{ + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. + */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -186,8 +239,9 @@ int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { /* We should not get called with FORM0 */ VM_WARN_ON(affinity_form == FORM0_AFFINITY); - - return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); } /* must hold reference to node during call */ @@ -201,7 +255,9 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (affinity_form == FORM0_AFFINITY) + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -216,37 +272,6 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static int __associativity_to_nid(const __be32 *associativity, - int max_array_sz) -{ - int nid; - /* - * primary_domain_index is 1 based array index. - */ - int index = primary_domain_index - 1; - - if (!numa_enabled || index >= max_array_sz) - return NUMA_NO_NODE; - - nid = of_read_number(&associativity[index], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - return nid; -} -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) -{ - int array_sz = of_read_number(associativity, 1); - - /* Skip the first element in the associativity array */ - return __associativity_to_nid((associativity + 1), array_sz); -} - /* Returns the nid associated with the given device tree node, * or -1 if not found. */ @@ -320,6 +345,8 @@ static void initialize_form1_numa_distance(const __be32 *associativity) */ void update_numa_distance(struct device_node *node) { + int nid; + if (affinity_form == FORM0_AFFINITY) return; else if (affinity_form == FORM1_AFFINITY) { @@ -332,6 +359,84 @@ void update_numa_distance(struct device_node *node) initialize_form1_numa_distance(associativity); return; } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; + struct device_node *root; + const __u8 *numa_dist_table; + const __be32 *numa_lookup_index; + int numa_dist_table_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); + numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + /* Skip the size which is encoded int */ + numa_dist_table += sizeof(__be32); + + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", + numa_dist_table_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (numa_dist_table_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + /* consider everybody else just remote. */ + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + if (nodeA == nodeB) + numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; + else + numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; + } + } + } + + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + } + } + of_node_put(root); } static int __init find_primary_domain_index(void) @@ -344,6 +449,9 @@ static int __init find_primary_domain_index(void) */ if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + dbg("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { dbg("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; @@ -388,9 +496,12 @@ static int __init find_primary_domain_index(void) index = of_read_number(&distance_ref_points[1], 1); } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. @@ -819,6 +930,12 @@ static int __init parse_numa_properties(void) dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); + /* * Even though we connect cpus to numa domains later in SMP * init, we need to know the node ids now. This is because diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 5d4c2bc20bba..f162156b7b68 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -123,6 +123,7 @@ vec5_fw_features_table[] = { {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, }; static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) -- cgit v1.2.3 From f5007dbf4da729baa850b33a64dc3cc53757bdf8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 Aug 2021 07:56:26 +0000 Subject: powerpc/booke: Avoid link stack corruption in several places Use bcl 20,31,+4 instead of bl in order to preserve link stack. See commit c974809a26a1 ("powerpc/vdso: Avoid link stack corruption in __get_datapage()") for details. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e9fbc285eceb720e6c0e032ef47fe8b05f669b48.1629791751.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 2 +- arch/powerpc/kernel/exceptions-64e.S | 6 +++--- arch/powerpc/kernel/fsl_booke_entry_mapping.S | 8 ++++---- arch/powerpc/kernel/head_44x.S | 6 +++--- arch/powerpc/kernel/head_fsl_booke.S | 6 +++--- arch/powerpc/mm/nohash/tlb_low.S | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index ffe712307e11..1c538a9a11e0 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -260,7 +260,7 @@ n: /* Be careful, this will clobber the lr register. */ #define LOAD_REG_ADDR_PIC(reg, name) \ - bl 0f; \ + bcl 20,31,$+4; \ 0: mflr reg; \ addis reg,reg,(name - 0b)@ha; \ addi reg,reg,(name - 0b)@l; diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..7e0943d9f9b0 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -1127,7 +1127,7 @@ found_iprot: * r3 = MAS0_TLBSEL (for the iprot array) * r4 = SPRN_TLBnCFG */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ @@ -1196,7 +1196,7 @@ skpinv: addi r6,r6,1 /* Increment */ mfmsr r6 xori r6,r6,MSR_IS mtspr SPRN_SRR1,r6 - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) mtspr SPRN_SRR0,r6 @@ -1256,7 +1256,7 @@ skpinv: addi r6,r6,1 /* Increment */ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping */ /* Now we branch the new virtual address mapped by this entry */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) tovirt(r6,r6) diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index 8bccce6544b5..dedc17fac8f8 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ @@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */ addi r6,r6,10 slw r6,r8,r6 /* convert to mask */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r7 mfspr r8,SPRN_MAS3 @@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */ xori r6,r4,1 slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r7,r9,0,20,31 addi r7,r7,(2f - 1b) @@ -207,7 +207,7 @@ next_tlb_setup: lis r7,MSR_KERNEL@h ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r6,r9,0,20,31 addi r6,r6,(2f - 1b) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index ddc978a2d381..02d2928d1e01 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -70,7 +70,7 @@ _ENTRY(_start); * address. * r21 will be loaded with the physical runtime address of _stext */ - bl 0f /* Get our runtime address */ + bcl 20,31,$+4 /* Get our runtime address */ 0: mflr r21 /* Make it accessible */ addis r21,r21,(_stext - 0b)@ha addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ @@ -853,7 +853,7 @@ _GLOBAL(init_cpu_state) wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ sync - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -1045,7 +1045,7 @@ head_start_47x: sync /* Find the entry we are running from */ - bl 1f + bcl 20,31,$+4 1: mflr r23 tlbsx r23,0,r23 tlbre r24,r23,0 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 0f9642f36b49..dbf3b89e543c 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -79,7 +79,7 @@ _ENTRY(_start); mr r23,r3 mr r25,r4 - bl 0f + bcl 20,31,$+4 0: mflr r8 addis r3,r8,(is_second_reloc - 0b)@ha lwz r19,(is_second_reloc - 0b)@l(r3) @@ -1132,7 +1132,7 @@ _GLOBAL(switch_to_as1) bne 1b /* Get the tlb entry used by the current running code */ - bl 0f + bcl 20,31,$+4 0: mflr r4 tlbsx 0,r4 @@ -1166,7 +1166,7 @@ _GLOBAL(switch_to_as1) _GLOBAL(restore_to_as0) mflr r0 - bl 0f + bcl 20,31,$+4 0: mflr r9 addi r9,r9,1f - 0b diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 4613bf8e9aae..5add4a51e51f 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -414,7 +414,7 @@ _GLOBAL(loadcam_multi) * Set up temporary TLB entry that is the same as what we're * running from, but in AS=1. */ - bl 1f + bcl 20,31,$+4 1: mflr r6 tlbsx 0,r8 mfspr r6,SPRN_MAS1 -- cgit v1.2.3 From 11f27a7fa4ca27935de74e3eb052bdc430d5f8d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:40 +0000 Subject: powerpc/ptdump: Use DEFINE_SHOW_ATTRIBUTE() Use DEFINE_SHOW_ATTRIBUTE() instead of open coding open() and fops. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b864a92693ca8413ef0b19f0c12065c212899b6e.1625762905.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/bats.c | 14 ++------------ arch/powerpc/mm/ptdump/hashpagetable.c | 12 +----------- arch/powerpc/mm/ptdump/ptdump.c | 13 +------------ arch/powerpc/mm/ptdump/segment_regs.c | 12 +----------- 4 files changed, 5 insertions(+), 46 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index 8bf7383fb26c..820c119013e4 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) -static int bats_show_603(struct seq_file *m, void *v) +static int bats_show(struct seq_file *m, void *v) { seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); @@ -88,17 +88,7 @@ static int bats_show_603(struct seq_file *m, void *v) return 0; } -static int bats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bats_show_603, NULL); -} - -static const struct file_operations bats_fops = { - .open = bats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bats); static int __init bats_init(void) { diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index ad6df9a2e7c8..c7f824d294b2 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5062c58b1e5b..349fd8fe173f 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -397,18 +397,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static void build_pgtable_complete_mask(void) { diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 9223dfb85c51..9df3af8d481f 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v) return 0; } -static int sr_open(struct inode *inode, struct file *file) -{ - return single_open(file, sr_show, NULL); -} - -static const struct file_operations sr_fops = { - .open = sr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sr); static int __init sr_init(void) { -- cgit v1.2.3 From 64b87b0c70e0fd28352895cba3c0a9631e0072dd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:41 +0000 Subject: powerpc/ptdump: Remove unused 'page_size' parameter note_page_update_state() doesn't use page_size. Remove it. Could also be removed to note_page() but as a following patch will remove all current users of note_page(), just leave it as is for now. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2f80d052001155251bfe009c360d0c5d9242c6b.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/ptdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 349fd8fe173f..3eb8732641da 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -189,7 +189,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; u64 pa = val & PTE_RPN_MASK; @@ -213,7 +213,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* At first no level is set */ if (!st->level) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long addr, * Address indicates we have passed the end of the * current section of virtual memory */ - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); } } -- cgit v1.2.3 From cf98d2b6eea6a1b2c43f85680ad58fcc3ea9496b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:42 +0000 Subject: powerpc/ptdump: Reduce level numbers by 1 in note_page() and add p4d level Do the same as commit f8f0d0b6fa20 ("mm: ptdump: reduce level numbers by 1 in note_page()") and add missing p4d level. This will align powerpc to the users of generic ptdump. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d76495c574132b197b445a1f133755cca4b912a4.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/8xx.c | 6 ++++-- arch/powerpc/mm/ptdump/book3s64.c | 6 ++++-- arch/powerpc/mm/ptdump/ptdump.c | 17 +++++++++-------- arch/powerpc/mm/ptdump/shared.c | 6 ++++-- 4 files changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 86da2a669680..fac932eb8f9a 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 14f73868db66..5ad92d9dc5d1 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 3eb8732641da..fb531bc64fc5 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -58,7 +58,7 @@ struct pg_state { const struct addr_marker *marker; unsigned long start_address; unsigned long start_pa; - unsigned int level; + int level; u64 current_flags; bool check_wx; unsigned long wx_pages; @@ -188,10 +188,9 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val) +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; u64 pa = val & PTE_RPN_MASK; st->level = level; @@ -206,12 +205,12 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, } static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + int level, u64 val, unsigned long page_size) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; /* At first no level is set */ - if (!st->level) { + if (st->level == -1) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); note_page_update_state(st, addr, level, val); /* @@ -383,6 +382,7 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .level = -1, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; @@ -393,7 +393,7 @@ static int ptdump_show(struct seq_file *m, void *v) /* Traverse kernel page tables */ walk_pagetables(&st); - note_page(&st, 0, 0, 0, 0); + note_page(&st, 0, -1, 0, 0); return 0; } @@ -415,6 +415,7 @@ void ptdump_check_wx(void) struct pg_state st = { .seq = NULL, .marker = address_markers, + .level = -1, .check_wx = true, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index c005fe041c18..03607ab90c66 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ -- cgit v1.2.3 From e084728393a58e7fdafeee2e6b20e0faff09b557 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:43 +0000 Subject: powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP This patch converts powerpc to the generic PTDUMP implementation. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/03166d569526be70214fe9370a7bad219d2f41c8.1625762907.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 30 --------- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_decl.h | 2 +- arch/powerpc/mm/ptdump/Makefile | 9 ++- arch/powerpc/mm/ptdump/ptdump.c | 146 ++++++++++------------------------------ 6 files changed, 47 insertions(+), 144 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d01e3401581d..ec866085ff8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -123,6 +123,7 @@ config PPC select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_ELF_RANDOMIZE @@ -182,6 +183,7 @@ config PPC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 205cd77f321f..192f0ed0097f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -365,36 +365,6 @@ config FAIL_IOMMU If you are unsure, say N. -config PPC_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL && DEBUG_FS - help - This option exports the state of the kernel pagetables to a - debugfs file. This is only useful for kernel developers who are - working in architecture specific areas of the kernel - probably - not a good idea to enable this feature in a production kernel. - - If you are unsure, say N. - -config PPC_DEBUG_WX - bool "Warn on W+X mappings at boot" - depends on PPC_PTDUMP && STRICT_KERNEL_RWX - help - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index eae4ec2988fc..df8172da2301 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_PTDUMP_CORE) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7dac910c0b21..dd1cabc2ea0f 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void); #else static inline void ptdump_check_wx(void) { } diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 712762be3cb1..4050cbb55acf 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -5,5 +5,10 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o -obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index fb531bc64fc5..2d80d775d15e 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ * */ struct pg_state { + struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; @@ -102,6 +104,11 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -204,10 +211,10 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, int } } -static void note_page(struct pg_state *st, unsigned long addr, - int level, u64 val, unsigned long page_size) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); /* At first no level is set */ if (st->level == -1) { @@ -245,94 +252,6 @@ static void note_page(struct pg_state *st, unsigned long addr, } } -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE); - - } -} - -static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start, - int pdshift, int level) -{ -#ifdef CONFIG_ARCH_HAS_HUGEPD - unsigned int i; - int shift = hugepd_shift(*phpd); - int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1; - - if (start & ((1 << shift) - 1)) - return; - - for (i = 0; i < ptrs_per_hpd; i++) { - unsigned long addr = start + (i << shift); - pte_t *pte = hugepte_offset(*phpd, addr, pdshift); - - note_page(st, addr, level + 1, pte_val(*pte), 1 << shift); - } -#endif -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE); - } -} - -static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) -{ - pud_t *pud = pud_offset(p4d, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_is_leaf(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud), PUD_SIZE); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - unsigned int i; - unsigned long addr = st->start_address & PGDIR_MASK; - pgd_t *pgd = pgd_offset_k(addr); - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - p4d_t *p4d = p4d_offset(pgd, 0); - - if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) - note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); - else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); - else - /* p4d exists */ - walk_pud(st, p4d, addr); - } -} - static void populate_markers(void) { int i = 0; @@ -383,17 +302,14 @@ static int ptdump_show(struct seq_file *m, void *v) .seq = m, .marker = address_markers, .level = -1, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, -1, 0, 0); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); return 0; } @@ -409,23 +325,24 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, - .marker = address_markers, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, .level = -1, .check_wx = true, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - - walk_pagetables(&st); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", @@ -435,12 +352,21 @@ void ptdump_check_wx(void) } #endif -static int ptdump_init(void) +static int __init ptdump_init(void) { +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; +#endif + populate_markers(); build_pgtable_complete_mask(); - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; } device_initcall(ptdump_init); -- cgit v1.2.3 From 806c0e6e7e97adc17389c8dc1f52d4736f49299b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 08:24:21 +0000 Subject: powerpc: Refactor verification of MSR_RI 40x and BOOKE don't have MSR_RI therefore all tests involving MSR_RI may be problematic on those plateforms. Create helpers to check or set MSR_RI in regs, and use them in common code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c2fb93708196734f4176dda334aaa3055f213b89.1629707037.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 23 +++++++++++++++++++++++ arch/powerpc/kernel/interrupt.c | 9 +++------ arch/powerpc/kernel/traps.c | 8 ++++---- arch/powerpc/mm/book3s64/slb.c | 2 +- arch/powerpc/platforms/embedded6xx/holly.c | 2 +- arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c | 2 +- arch/powerpc/platforms/pasemi/idle.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/sysdev/fsl_rio.c | 2 +- arch/powerpc/xmon/xmon.c | 16 +++------------- 11 files changed, 40 insertions(+), 30 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 5114ddba6f16..f9c235c1d6ce 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef __ASSEMBLY__ struct pt_regs @@ -272,6 +273,28 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) regs->gpr[3] = rc; } +static inline bool cpu_has_msr_ri(void) +{ + return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x); +} + +static inline bool regs_is_unrecoverable(struct pt_regs *regs) +{ + return unlikely(cpu_has_msr_ri() && !(regs->msr & MSR_RI)); +} + +static inline void regs_set_recoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr | MSR_RI); +} + +static inline void regs_set_unrecoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr & ~MSR_RI); +} + #define arch_has_single_step() (1) #define arch_has_block_step() (true) #define ARCH_HAS_USER_SINGLE_STEP_REPORT diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 531f0a1dbabc..a73f3f70a657 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -92,8 +92,7 @@ notrace long system_call_exception(long r3, long r4, long r5, CT_WARN_ON(ct_state() == CONTEXT_KERNEL); user_exit_irqoff(); - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); @@ -462,8 +461,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) { unsigned long ret; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -494,8 +492,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) bool stack_store = current_thread_info()->flags & _TIF_EMULATE_STACK_STORE; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && - unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* * CT_WARN_ON comes here via program_check_exception, diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5463d201d465..3a344f5265c2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -428,7 +428,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) return; nonrecoverable: - regs_set_return_msr(regs, regs->msr & ~MSR_RI); + regs_set_unrecoverable(regs); #endif } DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) @@ -498,7 +498,7 @@ out: die("Unrecoverable nested System Reset", regs, SIGABRT); #endif /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* For the reason explained in die_mce, nmi_exit before die */ nmi_exit(); die("Unrecoverable System Reset", regs, SIGABRT); @@ -550,7 +550,7 @@ static inline int check_io_access(struct pt_regs *regs) printk(KERN_DEBUG "%s bad port %lx at %p\n", (*nip & 0x100)? "OUT to": "IN from", regs->gpr[rb] - _IO_BASE, nip); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } @@ -840,7 +840,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) bail: /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) + if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c91bd85eb90e..f0037bcc47a0 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -822,7 +822,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 85521b3e7098..7a85b117f7a4 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -251,7 +251,7 @@ static int ppc750_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index d8da6a483e59..9eb9abb5bce2 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -173,7 +173,7 @@ static int mpc7448_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 9b88e3cded7d..2a5b84268b77 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -58,7 +58,7 @@ static int pasemi_system_reset_exception(struct pt_regs *regs) restore_astate(hard_smp_processor_id()); /* everything handled */ - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); return 1; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2835376e61a4..e9d18519e650 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -588,7 +588,7 @@ static int opal_recover_mce(struct pt_regs *regs, { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..56092dccfdb8 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -783,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 5a95b8ea23d8..ff7906b48ca1 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -108,7 +108,7 @@ int fsl_rio_mcheck_exception(struct pt_regs *regs) __func__); out_be32((u32 *)(rio_regs_win + RIO_LTLEDCSR), 0); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index ead460b80905..dd8241c009e5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -482,16 +482,6 @@ static inline void get_output_lock(void) {} static inline void release_output_lock(void) {} #endif -static inline int unrecoverable_excp(struct pt_regs *regs) -{ -#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E) - /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */ - return 0; -#else - return ((regs->msr & MSR_RI) == 0); -#endif -} - static void xmon_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -565,7 +555,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) bp = NULL; if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT)) bp = at_breakpoint(regs->nip); - if (bp || unrecoverable_excp(regs)) + if (bp || regs_is_unrecoverable(regs)) fromipi = 0; if (!fromipi) { @@ -577,7 +567,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) cpu, BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); release_output_lock(); @@ -693,7 +683,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) printf("Stopped at breakpoint %tx (", BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); remove_bpts(); -- cgit v1.2.3 From 544af6429777cefae2f8af9a9866df5e8cb21763 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:17 +0530 Subject: powerpc/numa: Drop dbg in favour of pr_debug powerpc supported numa=debug which is not documented. This option was used to print early debug output. However something more flexible can be achieved by using CONFIG_DYNAMIC_DEBUG. Hence drop dbg (and numa=debug) in favour of pr_debug Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase on to powerpc/next form2 affinity changes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-2-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe9c6ced40b2..73ba6b072274 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -40,9 +40,6 @@ static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; struct pglist_data *node_data[MAX_NUMNODES]; @@ -87,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %u nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -131,7 +128,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; @@ -149,7 +146,7 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); + pr_debug("adding cpu %d to node %d\n", cpu, node); if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) cpumask_set_cpu(cpu, node_to_cpumask_map[node]); @@ -160,7 +157,7 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); + pr_debug("removing cpu %lu from node %d\n", cpu, node); if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); @@ -450,10 +447,10 @@ static int __init find_primary_domain_index(void) if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { - dbg("Using form 2 affinity\n"); + pr_debug("Using form 2 affinity\n"); affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { - dbg("Using form 1 affinity\n"); + pr_debug("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; } else affinity_form = FORM0_AFFINITY; @@ -482,7 +479,7 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); goto err; } @@ -928,7 +925,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1251,9 +1248,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1416,7 +1410,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_SUCCESS: - dbg("VPHN hcall succeeded. Reset polling...\n"); + pr_debug("VPHN hcall succeeded. Reset polling...\n"); goto out; case H_FUNCTION: -- cgit v1.2.3 From 506c2075ffd8db352c53201ef166948a272e3bce Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:18 +0530 Subject: powerpc/numa: convert printk to pr_xxx Convert the remaining printk to pr_xxx One advantage would be all prints will now have prefix "numa:" from pr_fmt(). [ convert printk(KERN_ERR) to pr_warn : Suggested by Laurent Dufour ] Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase onto powerpc/next, s/WARNING/Warning/] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-3-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 73ba6b072274..f4e52581713e 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -162,8 +162,7 @@ static void unmap_cpu_from_node(unsigned long cpu) if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ @@ -479,15 +478,14 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } @@ -504,8 +502,8 @@ static int __init find_primary_domain_index(void) * MAX_DISTANCE_REF_POINTS domains. */ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } @@ -910,7 +908,7 @@ static int __init parse_numa_properties(void) const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } @@ -925,7 +923,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1038,10 +1036,8 @@ static void __init setup_nonnuma(void) unsigned int nid = 0; int i; - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); -- cgit v1.2.3 From 544a09ee7434b949552266d20ece538d35535bd5 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:19 +0530 Subject: powerpc/numa: Print debug statements only when required Currently, a debug message gets printed every time an attempt to add(remove) a CPU. However this is redundant if the CPU is already added (removed) from the node. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-4-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f4e52581713e..52bc63ee6a6a 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -146,10 +146,10 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - pr_debug("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) @@ -157,10 +157,9 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - pr_debug("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } -- cgit v1.2.3 From 9a245d0e1f006bc7ccf0285d0d520ed304d00c4a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:20 +0530 Subject: powerpc/numa: Update cpu_cpu_map on CPU online/offline cpu_cpu_map holds all the CPUs in the DIE. However in PowerPC, when onlining/offlining of CPUs, this mask doesn't get updated. This mask is however updated when CPUs are added/removed. So when both operations like online/offline of CPUs and adding/removing of CPUs are done simultaneously, then cpumaps end up broken. WARNING: CPU: 13 PID: 1142 at kernel/sched/topology.c:898 build_sched_domains+0xd48/0x1720 Modules linked in: rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables nfnetlink pseries_rng xts vmx_crypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 13 PID: 1142 Comm: kworker/13:2 Not tainted 5.13.0-rc6+ #28 Workqueue: events cpuset_hotplug_workfn NIP: c0000000001caac8 LR: c0000000001caac4 CTR: 00000000007088ec REGS: c00000005596f220 TRAP: 0700 Not tainted (5.13.0-rc6+) MSR: 8000000000029033 CR: 48828222 XER: 00000009 CFAR: c0000000001ea698 IRQMASK: 0 GPR00: c0000000001caac4 c00000005596f4c0 c000000001c4a400 0000000000000036 GPR04: 00000000fffdffff c00000005596f1d0 0000000000000027 c0000018cfd07f90 GPR08: 0000000000000023 0000000000000001 0000000000000027 c0000018fe68ffe8 GPR12: 0000000000008000 c00000001e9d1880 c00000013a047200 0000000000000800 GPR16: c000000001d3c7d0 0000000000000240 0000000000000048 c000000010aacd18 GPR20: 0000000000000001 c000000010aacc18 c00000013a047c00 c000000139ec2400 GPR24: 0000000000000280 c000000139ec2520 c000000136c1b400 c000000001c93060 GPR28: c00000013a047c20 c000000001d3c6c0 c000000001c978a0 000000000000000d NIP [c0000000001caac8] build_sched_domains+0xd48/0x1720 LR [c0000000001caac4] build_sched_domains+0xd44/0x1720 Call Trace: [c00000005596f4c0] [c0000000001caac4] build_sched_domains+0xd44/0x1720 (unreliable) [c00000005596f670] [c0000000001cc5ec] partition_sched_domains_locked+0x3ac/0x4b0 [c00000005596f710] [c0000000002804e4] rebuild_sched_domains_locked+0x404/0x9e0 [c00000005596f810] [c000000000283e60] rebuild_sched_domains+0x40/0x70 [c00000005596f840] [c000000000284124] cpuset_hotplug_workfn+0x294/0xf10 [c00000005596fc60] [c000000000175040] process_one_work+0x290/0x590 [c00000005596fd00] [c0000000001753c8] worker_thread+0x88/0x620 [c00000005596fda0] [c000000000181704] kthread+0x194/0x1a0 [c00000005596fe10] [c00000000000ccec] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 485af049 60000000 2fa30800 409e0028 80fe0000 e89a00f8 e86100e8 38da0120 7f88e378 7ce53b78 4801fb91 60000000 <0fe00000> 39000000 38e00000 38c00000 Fix this by updating cpu_cpu_map aka cpumask_of_node() on every CPU online/offline. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-5-srikar@linux.vnet.ibm.com --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ arch/powerpc/kernel/smp.c | 3 +++ arch/powerpc/mm/numa.c | 7 ++----- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a4712ecad3e9..36fcafb1fd6d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -66,6 +66,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); void update_numa_distance(struct device_node *node); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -95,6 +100,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) } static inline void update_numa_distance(struct device_node *node) {} + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 548aa58d25f9..9cc7d3dbf439 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1442,6 +1442,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1526,6 +1528,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. */ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 52bc63ee6a6a..6f14c8fb6359 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -142,7 +142,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -153,7 +153,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -800,9 +800,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } -- cgit v1.2.3 From b14b8b1ed0e15b8f43fba9c25654278a31ee3c2f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 31 Aug 2021 23:51:51 +1000 Subject: powerpc/ptdump: Fix generic ptdump for 64-bit Since the conversion to generic ptdump we see crashes on 64-bit: BUG: Unable to handle kernel data access on read at 0xc0eeff7f00000000 Faulting instruction address: 0xc00000000045e5fc Oops: Kernel access of bad area, sig: 11 [#1] ... NIP __walk_page_range+0x2bc/0xce0 LR __walk_page_range+0x240/0xce0 Call Trace: __walk_page_range+0x240/0xce0 (unreliable) walk_page_range_novma+0x74/0xb0 ptdump_walk_pgd+0x98/0x170 ptdump_check_wx+0x88/0xd0 mark_rodata_ro+0x48/0x80 kernel_init+0x74/0x1a0 ret_from_kernel_thread+0x5c/0x64 What's happening is that have walked off the end of the kernel page tables, and started dereferencing junk values. That happens because we initialised the ptdump_range to span all the way up to 0xffffffffffffffff: static struct ptdump_range ptdump_range[] __ro_after_init = { {TASK_SIZE_MAX, ~0UL}, But the kernel page tables don't span that far. So on 64-bit set the end of the range to be the address immediately past the end of the kernel page tables, to limit the page table walk to valid addresses. Fixes: e084728393a5 ("powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210831135151.886620-1-mpe@ellerman.id.au --- arch/powerpc/mm/ptdump/ptdump.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2d80d775d15e..bf251191e78d 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -359,6 +359,8 @@ static int __init ptdump_init(void) ptdump_range[0].start = KERN_VIRT_START; else ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); #endif populate_markers(); -- cgit v1.2.3