From d2e60075a3d4422dc54b919f3b125d8066b839d4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:12 +1000 Subject: powerpc/64: Use array of paca pointers and allocate pacas individually Change the paca array into an array of pointers to pacas. Allocate pacas individually. This allows flexibility in where the PACAs are allocated. Future work will allocate them node-local. Platforms that don't have address limits on PACAs would be able to defer PACA allocations until later in boot rather than allocate all possible ones up-front then freeing unused. This is slightly more overhead (one additional indirection) for cross CPU paca references, but those aren't too common. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 71d1b19ad1c0..e6016f4466f3 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -723,7 +723,7 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { if (sib == cpu) continue; - if (paca[sib].kvm_hstate.kvm_vcpu) + if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) flush = true; } if (flush) -- cgit v1.2.3 From 499dcd41378ebab2a37a0df65735748d66e75599 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:13 +1000 Subject: powerpc/64s: Allocate LPPACAs individually We no longer allocate lppacas in an array, so this patch removes the 1kB static alignment for the structure, and enforces the PAPR alignment requirements at allocation time. We can not reduce the 1kB allocation size however, due to existing KVM hypervisors. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/lppaca.h | 24 ++++----- arch/powerpc/kernel/machine_kexec_64.c | 15 ++++-- arch/powerpc/kernel/paca.c | 89 ++++++++++++---------------------- arch/powerpc/kvm/book3s_hv.c | 3 +- arch/powerpc/mm/numa.c | 4 +- arch/powerpc/platforms/pseries/kexec.c | 7 ++- 6 files changed, 63 insertions(+), 79 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 6e4589eee2da..65d589689f01 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -36,14 +36,16 @@ #include /* - * We only have to have statically allocated lppaca structs on - * legacy iSeries, which supports at most 64 cpus. - */ -#define NR_LPPACAS 1 - -/* - * The Hypervisor barfs if the lppaca crosses a page boundary. A 1k - * alignment is sufficient to prevent this + * The lppaca is the "virtual processor area" registered with the hypervisor, + * H_REGISTER_VPA etc. + * + * According to PAPR, the structure is 640 bytes long, must be L1 cache line + * aligned, and must not cross a 4kB boundary. Its size field must be at + * least 640 bytes (but may be more). + * + * Pre-v4.14 KVM hypervisors reject the VPA if its size field is smaller than + * 1kB, so we dynamically allocate 1kB and advertise size as 1kB, but keep + * this structure as the canonical 640 byte size. 
*/ struct lppaca { /* cacheline 1 contains read-only data */ @@ -97,11 +99,9 @@ struct lppaca { __be32 page_ins; /* CMO Hint - # page ins by OS */ u8 reserved11[148]; - volatile __be64 dtl_idx; /* Dispatch Trace Log head index */ + volatile __be64 dtl_idx; /* Dispatch Trace Log head index */ u8 reserved12[96]; -} __attribute__((__aligned__(0x400))); - -extern struct lppaca lppaca[]; +} ____cacheline_aligned; #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr) diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index a250e3331f94..1044bf15d5ed 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -323,17 +323,24 @@ void default_machine_kexec(struct kimage *image) kexec_stack.thread_info.cpu = current_thread_info()->cpu; /* We need a static PACA, too; copy this CPU's PACA over and switch to - * it. Also poison per_cpu_offset to catch anyone using non-static - * data. + * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using + * non-static data. */ memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct)); kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL; +#ifdef CONFIG_PPC_PSERIES + kexec_paca.lppaca_ptr = NULL; +#endif paca_ptrs[kexec_paca.paca_index] = &kexec_paca; + setup_paca(&kexec_paca); - /* XXX: If anyone does 'dynamic lppacas' this will also need to be - * switched to a static version! + /* + * The lppaca should be unregistered at this point so the HV won't + * touch it. In the case of a crash, none of the lppacas are + * unregistered so there is not much we can do about it here. */ + /* * On Book3S, the copy must happen with the MMU off if we are either * using Radix page tables or we are not in an LPAR since we can diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index eef4891c9af6..6cddb9bdc151 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -23,82 +23,50 @@ #ifdef CONFIG_PPC_PSERIES /* - * The structure which the hypervisor knows about - this structure - * should not cross a page boundary. The vpa_init/register_vpa call - * is now known to fail if the lppaca structure crosses a page - * boundary. The lppaca is also used on POWER5 pSeries boxes. - * The lppaca is 640 bytes long, and cannot readily - * change since the hypervisor knows its layout, so a 1kB alignment - * will suffice to ensure that it doesn't cross a page boundary. + * See asm/lppaca.h for more detail. + * + * lppaca structures must must be 1kB in size, L1 cache line aligned, + * and not cross 4kB boundary. A 1kB size and 1kB alignment will satisfy + * these requirements. */ -struct lppaca lppaca[] = { - [0 ... 
(NR_LPPACAS-1)] = { +static inline void init_lppaca(struct lppaca *lppaca) +{ + BUILD_BUG_ON(sizeof(struct lppaca) != 640); + + *lppaca = (struct lppaca) { .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ - .size = cpu_to_be16(sizeof(struct lppaca)), + .size = cpu_to_be16(0x400), .fpregs_in_use = 1, .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, - .page_ins = 0, - }, + .page_ins = 0, }; }; -static struct lppaca *extra_lppacas; -static long __initdata lppaca_size; - -static void __init allocate_lppacas(int nr_cpus, unsigned long limit) -{ - if (early_cpu_has_feature(CPU_FTR_HVMODE)) - return; - - if (nr_cpus <= NR_LPPACAS) - return; - - lppaca_size = PAGE_ALIGN(sizeof(struct lppaca) * - (nr_cpus - NR_LPPACAS)); - extra_lppacas = __va(memblock_alloc_base(lppaca_size, - PAGE_SIZE, limit)); -} - -static struct lppaca * __init new_lppaca(int cpu) +static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) { struct lppaca *lp; + size_t size = 0x400; + + BUILD_BUG_ON(size < sizeof(struct lppaca)); if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - if (cpu < NR_LPPACAS) - return &lppaca[cpu]; - - lp = extra_lppacas + (cpu - NR_LPPACAS); - *lp = lppaca[0]; + lp = __va(memblock_alloc_base(size, 0x400, limit)); + init_lppaca(lp); return lp; } -static void __init free_lppacas(void) +static void __init free_lppaca(struct lppaca *lp) { - long new_size = 0, nr; + size_t size = 0x400; if (early_cpu_has_feature(CPU_FTR_HVMODE)) return; - if (!lppaca_size) - return; - nr = num_possible_cpus() - NR_LPPACAS; - if (nr > 0) - new_size = PAGE_ALIGN(nr * sizeof(struct lppaca)); - if (new_size >= lppaca_size) - return; - - memblock_free(__pa(extra_lppacas) + new_size, lppaca_size - new_size); - lppaca_size = new_size; + memblock_free(__pa(lp), size); } - -#else - -static inline void allocate_lppacas(int nr_cpus, unsigned long limit) { } -static inline void free_lppacas(void) { } - #endif /* CONFIG_PPC_BOOK3S */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -167,7 +135,7 @@ EXPORT_SYMBOL(paca_ptrs); void __init initialise_paca(struct paca_struct *new_paca, int cpu) { #ifdef CONFIG_PPC_PSERIES - new_paca->lppaca_ptr = new_lppaca(cpu); + new_paca->lppaca_ptr = NULL; #endif #ifdef CONFIG_PPC_BOOK3E new_paca->kernel_pgd = swapper_pg_dir; @@ -254,13 +222,15 @@ void __init allocate_pacas(void) printk(KERN_DEBUG "Allocated %lu bytes for %u pacas\n", size, nr_cpu_ids); - allocate_lppacas(nr_cpu_ids, limit); - allocate_slb_shadows(nr_cpu_ids, limit); /* Can't use for_each_*_cpu, as they aren't functional yet */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { initialise_paca(paca_ptrs[cpu], cpu); +#ifdef CONFIG_PPC_PSERIES + paca_ptrs[cpu]->lppaca_ptr = new_lppaca(cpu, limit); +#endif + } } void __init free_unused_pacas(void) @@ -272,6 +242,9 @@ void __init free_unused_pacas(void) for (cpu = 0; cpu < paca_nr_cpu_ids; cpu++) { if (!cpu_possible(cpu)) { unsigned long pa = __pa(paca_ptrs[cpu]); +#ifdef CONFIG_PPC_PSERIES + free_lppaca(paca_ptrs[cpu]->lppaca_ptr); +#endif memblock_free(pa, sizeof(struct paca_struct)); paca_ptrs[cpu] = NULL; size += sizeof(struct paca_struct); @@ -288,8 +261,6 @@ void __init free_unused_pacas(void) if (size) printk(KERN_DEBUG "Freed %lu bytes for unused pacas\n", size); - free_lppacas(); - paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = new_ptrs_size; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 41fce69714d5..9b48d4a191ff 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -498,7 +498,8 @@ 
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, * use 640 bytes of the structure though, so we should accept * clients that set a size of 640. */ - if (len < 640) + BUILD_BUG_ON(sizeof(struct lppaca) != 640); + if (len < sizeof(struct lppaca)) break; vpap = &tvcpu->arch.vpa; err = 0; diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 314d19ab9385..e9ec465068f1 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1110,7 +1110,7 @@ static void setup_cpu_associativity_change_counters(void) for_each_possible_cpu(cpu) { int i; u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts; for (i = 0; i < distance_ref_points_depth; i++) counts[i] = hypervisor_counts[i]; @@ -1136,7 +1136,7 @@ static int update_cpu_associativity_changes_mask(void) for_each_possible_cpu(cpu) { int i, changed = 0; u8 *counts = vphn_cpu_change_counts[cpu]; - volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts; for (i = 0; i < distance_ref_points_depth; i++) { if (hypervisor_counts[i] != counts[i]) { diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index eeb13429d685..3fe126796975 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -23,7 +23,12 @@ void pseries_kexec_cpu_down(int crash_shutdown, int secondary) { - /* Don't risk a hypervisor call if we're crashing */ + /* + * Don't risk a hypervisor call if we're crashing + * XXX: Why? The hypervisor is not crashing. It might be better + * to at least attempt unregister to avoid the hypervisor stepping + * on our memory. + */ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { int ret; int cpu = smp_processor_id(); -- cgit v1.2.3 From 9bd9be006c8ec0ccf7cb0422d35033af39d3f969 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:16 +1000 Subject: powerpc/mm/numa: move numa topology discovery earlier Split sparsemem initialisation from basic numa topology discovery. Move the parsing earlier in boot, before pacas are allocated. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/setup.h | 1 + arch/powerpc/kernel/setup-common.c | 3 +++ arch/powerpc/mm/mem.c | 5 ++++- arch/powerpc/mm/numa.c | 32 +++++++++++++++++++------------- 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 469b7fdc9be4..d2bf233aebd5 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -23,6 +23,7 @@ extern void reloc_got2(unsigned long); #define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x))) void check_for_initrd(void); +void mem_topology_setup(void); void initmem_init(void); void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index d73ec518ef80..9eaf26318d20 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -888,6 +888,9 @@ void __init setup_arch(char **cmdline_p) /* Check the SMT related command line arguments (ppc64). */ check_smt_enabled(); + /* Parse memory topology */ + mem_topology_setup(); + /* On BookE, setup per-core TLB data structures. 
*/ setup_tlb_core_data(); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index fe8c61149fb8..4eee46ea4d96 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -212,7 +212,7 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, EXPORT_SYMBOL_GPL(walk_system_ram_range); #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(void) +void __init mem_topology_setup(void) { max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; min_low_pfn = MEMORY_START >> PAGE_SHIFT; @@ -224,7 +224,10 @@ void __init initmem_init(void) * memblock_regions */ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); +} +void __init initmem_init(void) +{ /* XXX need to clip this if using highmem? */ sparse_memory_present_with_active_regions(0); sparse_init(); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index e9ec465068f1..1eec1bcc03a6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -836,18 +836,13 @@ out: of_node_put(rtas); } -void __init initmem_init(void) +void __init mem_topology_setup(void) { - int nid, cpu; - - max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; - max_pfn = max_low_pfn; + int cpu; if (parse_numa_properties()) setup_nonnuma(); - memblock_dump_all(); - /* * Modify the set of possible NUMA nodes to reflect information * available about the set of online nodes, and the set of nodes @@ -858,6 +853,23 @@ void __init initmem_init(void) find_possible_nodes(); + setup_node_to_cpumask_map(); + + reset_numa_cpu_lookup_table(); + + for_each_present_cpu(cpu) + numa_setup_cpu(cpu); +} + +void __init initmem_init(void) +{ + int nid; + + max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + max_pfn = max_low_pfn; + + memblock_dump_all(); + for_each_online_node(nid) { unsigned long start_pfn, end_pfn; @@ -868,10 +880,6 @@ void __init initmem_init(void) sparse_init(); - setup_node_to_cpumask_map(); - - reset_numa_cpu_lookup_table(); - /* * We need the numa_cpu_lookup_table to be accurate for all CPUs, * even before we online them, so that we can use cpu_to_{node,mem} @@ -881,8 +889,6 @@ void __init initmem_init(void) */ cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare", ppc_numa_cpu_prepare, ppc_numa_cpu_dead); - for_each_present_cpu(cpu) - numa_setup_cpu(cpu); } static int __init early_numa(char *p) -- cgit v1.2.3 From 0633dafcf8921048eb1ddc8c6fcfbe1b1cf3e42c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:23 +1000 Subject: powerpc/64s/radix: Split early page table mapping to its own function Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 112 +++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 47 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 573a9a2ee455..5afff0eb57d4 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -56,6 +56,50 @@ static __ref void *early_alloc_pgtable(unsigned long size) return pt; } +static int early_map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if 
(pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + smp_wmb(); + return 0; +} + + int radix__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int map_page_size) @@ -68,54 +112,28 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, * Make sure task size is correct as per the max adddr */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - } else { - pgdp = pgd_offset_k(ea); - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } - pudp = pud_offset(pgdp, ea); - if (map_page_size == PUD_SIZE) { - ptep = (pte_t *)pudp; - goto set_the_pte; - } - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (map_page_size == PMD_SIZE) { - ptep = pmdp_ptep(pmdp); - goto set_the_pte; - } - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); + + if (!slab_is_available()) + return early_map_kernel_page(ea, pa, flags, map_page_size); + + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = pmdp_ptep(pmdp); + goto set_the_pte; } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; set_the_pte: set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); -- cgit v1.2.3 From 2ad452ffaaa8d2f1124208e507e7b045e8ee98a6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:24 +1000 Subject: powerpc/64s/radix: Allocate kernel page tables node-local if possible Try to allocate kernel page tables for direct mapping and vmemmap according to the node of the memory they will map. The node is not available for the linear map in early boot, so use range allocation to allocate the page tables from the region they map, which is effectively node-local. 
Signed-off-by: Nicholas Piggin [mpe: Fix build error in radix__create_section_mapping()] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 104 ++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 24 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 5afff0eb57d4..5f4b75315990 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -46,11 +46,26 @@ static int native_register_process_table(unsigned long base, unsigned long pg_sz return 0; } -static __ref void *early_alloc_pgtable(unsigned long size) +static __ref void *early_alloc_pgtable(unsigned long size, int nid, + unsigned long region_start, unsigned long region_end) { + unsigned long pa = 0; void *pt; - pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + if (region_start || region_end) /* has region hint */ + pa = memblock_alloc_range(size, size, region_start, region_end, + MEMBLOCK_NONE); + else if (nid != -1) /* has node hint */ + pa = memblock_alloc_base_nid(size, size, + MEMBLOCK_ALLOC_ANYWHERE, + nid, MEMBLOCK_NONE); + + if (!pa) + pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE); + + BUG_ON(!pa); + + pt = __va(pa); memset(pt, 0, size); return pt; @@ -58,8 +73,11 @@ static __ref void *early_alloc_pgtable(unsigned long size) static int early_map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, - unsigned int map_page_size) + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) { + unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; @@ -67,8 +85,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, pgdp = pgd_offset_k(ea); if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); + pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid, + region_start, region_end); pgd_populate(&init_mm, pgdp, pudp); } pudp = pud_offset(pgdp, ea); @@ -77,8 +95,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, goto set_the_pte; } if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, + region_start, region_end); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); @@ -87,23 +105,29 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, goto set_the_pte; } if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); + ptep = early_alloc_pgtable(PAGE_SIZE, nid, + region_start, region_end); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); smp_wmb(); return 0; } - -int radix__map_kernel_page(unsigned long ea, unsigned long pa, +/* + * nid, region_start, and region_end are hints to try to place the page + * table memory in the same node or region. 
+ */ +static int __map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, - unsigned int map_page_size) + unsigned int map_page_size, + int nid, + unsigned long region_start, unsigned long region_end) { + unsigned long pfn = pa >> PAGE_SHIFT; pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; @@ -113,9 +137,15 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); - if (!slab_is_available()) - return early_map_kernel_page(ea, pa, flags, map_page_size); + if (unlikely(!slab_is_available())) + return early_map_kernel_page(ea, pa, flags, map_page_size, + nid, region_start, region_end); + /* + * Should make page table allocation functions be able to take a + * node, so we can place kernel page tables on the right nodes after + * boot. + */ pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -136,11 +166,25 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, return -ENOMEM; set_the_pte: - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); smp_wmb(); return 0; } +static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size, int nid) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0); +} + #ifdef CONFIG_STRICT_KERNEL_RWX void radix__change_memory_range(unsigned long start, unsigned long end, unsigned long clear) @@ -227,7 +271,8 @@ static inline void __meminit print_mapping(unsigned long start, } static int __meminit create_physical_mapping(unsigned long start, - unsigned long end) + unsigned long end, + int nid) { unsigned long vaddr, addr, mapping_size = 0; pgprot_t prot; @@ -283,7 +328,7 @@ retry: else prot = PAGE_KERNEL; - rc = radix__map_kernel_page(vaddr, addr, prot, mapping_size); + rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); if (rc) return rc; } @@ -292,7 +337,7 @@ retry: return 0; } -static void __init radix_init_pgtable(void) +void __init radix_init_pgtable(void) { unsigned long rts_field; struct memblock_region *reg; @@ -302,9 +347,16 @@ static void __init radix_init_pgtable(void) /* * Create the linear mapping, using standard page size for now */ - for_each_memblock(memory, reg) + for_each_memblock(memory, reg) { + /* + * The memblock allocator is up at this point, so the + * page tables will be allocated within the range. No + * need or a node (which we don't have yet). + */ WARN_ON(create_physical_mapping(reg->base, - reg->base + reg->size)); + reg->base + reg->size, + -1)); + } /* Find out how many PID bits are supported */ if (cpu_has_feature(CPU_FTR_HVMODE)) { @@ -333,7 +385,7 @@ static void __init radix_init_pgtable(void) * host. */ BUG_ON(PRTB_SIZE_SHIFT > 36); - process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); /* * Fill in the process table. 
*/ @@ -810,7 +862,7 @@ static void remove_pagetable(unsigned long start, unsigned long end) int __ref radix__create_section_mapping(unsigned long start, unsigned long end) { - return create_physical_mapping(start, end); + return create_physical_mapping(start, end, -1); } int radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -827,8 +879,12 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, { /* Create a PTE encoding */ unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); + int ret; + + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); + BUG_ON(ret); - BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); return 0; } -- cgit v1.2.3 From 29ab6c4708a587bc27ea0c765ac36aef9c1a77c9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 14 Feb 2018 01:08:22 +1000 Subject: powerpc/mm: Pass node id into create_section_mapping Signed-off-by: Nicholas Piggin [mpe: Move __map_kernel_page_nid() inside #ifdef SPARSEMEM_VMEMMAP] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 2 +- arch/powerpc/include/asm/book3s/64/radix.h | 2 +- arch/powerpc/include/asm/sparsemem.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/powerpc/mm/pgtable-book3s64.c | 6 +++--- arch/powerpc/mm/pgtable-radix.c | 18 +++++++++--------- 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0920eff731b3..b1ace9619e94 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -201,7 +201,7 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned long start, extern void hash__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); -int hash__create_section_mapping(unsigned long start, unsigned long end); +int hash__create_section_mapping(unsigned long start, unsigned long end, int nid); int hash__remove_section_mapping(unsigned long start, unsigned long end); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 365010f66570..705193e7192f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -313,7 +313,7 @@ static inline unsigned long radix__get_tree_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int radix__create_section_mapping(unsigned long start, unsigned long end); +int radix__create_section_mapping(unsigned long start, unsigned long end, int nid); int radix__remove_section_mapping(unsigned long start, unsigned long end); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index a7916ee6dfb6..bc66712bdc3c 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -17,7 +17,7 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern int create_section_mapping(unsigned long start, unsigned long end); +extern int create_section_mapping(unsigned long start, unsigned long end, int nid); extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7d07c7e17db6..ceb5494804b2 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ 
b/arch/powerpc/mm/hash_utils_64.c @@ -781,7 +781,7 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) } } -int hash__create_section_mapping(unsigned long start, unsigned long end) +int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { int rc = htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 4eee46ea4d96..f50ce66dd6bd 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -117,7 +117,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int __weak create_section_mapping(unsigned long start, unsigned long end) +int __weak create_section_mapping(unsigned long start, unsigned long end, int nid) { return -ENODEV; } @@ -137,7 +137,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, resize_hpt_for_hotplug(memblock_phys_mem_size()); start = (unsigned long)__va(start); - rc = create_section_mapping(start, start + size); + rc = create_section_mapping(start, start + size, nid); if (rc) { pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n", start, start + size, rc); diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 422e80253a33..c736280068ce 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -155,12 +155,12 @@ void mmu_cleanup_all(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end, int nid) { if (radix_enabled()) - return radix__create_section_mapping(start, end); + return radix__create_section_mapping(start, end, nid); - return hash__create_section_mapping(start, end); + return hash__create_section_mapping(start, end, nid); } int remove_section_mapping(unsigned long start, unsigned long end) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 5f4b75315990..a425636af8b4 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -171,13 +171,6 @@ set_the_pte: return 0; } -static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, - pgprot_t flags, - unsigned int map_page_size, int nid) -{ - return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); -} - int radix__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int map_page_size) @@ -860,9 +853,9 @@ static void remove_pagetable(unsigned long start, unsigned long end) radix__flush_tlb_kernel_range(start, end); } -int __ref radix__create_section_mapping(unsigned long start, unsigned long end) +int __ref radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { - return create_physical_mapping(start, end, -1); + return create_physical_mapping(start, end, nid); } int radix__remove_section_mapping(unsigned long start, unsigned long end) @@ -873,6 +866,13 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end) #endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __map_kernel_page_nid(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int map_page_size, + int nid) +{ + return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0); +} + int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) -- cgit v1.2.3
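
For readers following the "Allocate kernel page tables node-local if possible" patch above, the hint cascade in early_alloc_pgtable() (region hint first, then NUMA node hint, then an unconstrained allocation) is easy to miss inside the diff. The standalone C sketch below models only that fallback policy in userspace; it is illustrative, not kernel code. The alloc_in_region(), alloc_on_node() and alloc_anywhere() helpers are hypothetical stand-ins for memblock_alloc_range(), memblock_alloc_base_nid() and memblock_alloc_base(), and the constants are arbitrary.

    /*
     * Illustrative sketch only -- NOT kernel code. It models the fallback
     * policy used by early_alloc_pgtable() in the patch above: prefer a
     * caller-supplied physical region, then a NUMA node hint, then any
     * address at all. The three alloc_* helpers are hypothetical stand-ins
     * for the memblock calls used by the real code.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical back-ends; the kernel would call into memblock here. */
    static void *alloc_in_region(size_t size, unsigned long start, unsigned long end)
    {
    	(void)start; (void)end;
    	return NULL;		/* pretend the constrained pool is exhausted */
    }

    static void *alloc_on_node(size_t size, int nid)
    {
    	(void)nid;
    	return aligned_alloc(size, size);	/* stand-in for a node-local pool */
    }

    static void *alloc_anywhere(size_t size)
    {
    	return aligned_alloc(size, size);
    }

    /*
     * The hint cascade: a region hint wins over a node hint, and both fall
     * back to an unconstrained allocation rather than failing outright.
     */
    static void *early_alloc_pgtable_model(size_t size, int nid,
    				       unsigned long region_start,
    				       unsigned long region_end)
    {
    	void *pt = NULL;

    	if (region_start || region_end)
    		pt = alloc_in_region(size, region_start, region_end);
    	else if (nid != -1)
    		pt = alloc_on_node(size, nid);

    	if (!pt)
    		pt = alloc_anywhere(size);

    	if (!pt)
    		abort();	/* the kernel BUG()s here: page tables are mandatory */

    	memset(pt, 0, size);	/* new page tables must start out zeroed */
    	return pt;
    }

    int main(void)
    {
    	/* A 4kB "page table" with a node hint but no region hint. */
    	void *pt = early_alloc_pgtable_model(4096, 0, 0, 0);

    	printf("allocated model page table at %p\n", pt);
    	free(pt);
    	return 0;
    }

In the real series the region hint is only usable for the linear mapping (where the mapped range itself is a good proxy for locality), while vmemmap mappings pass a node id obtained from early_pfn_to_nid(); callers with no information pass nid = -1 and empty region bounds, which degenerates to the old behaviour.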