From 29674a1c71be710f8418468aa6a8addd6d1aba1c Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 17 Sep 2019 11:22:30 -0400 Subject: powerpc/pkeys: remove unused pkey_allows_readwrite pkey_allows_readwrite() was first introduced in the commit 5586cf61e108 ("powerpc: introduce execute-only pkey"), but the usage was removed entirely in the commit a4fcc877d4e1 ("powerpc/pkeys: Preallocate execute-only key"). Found by the "-Wunused-function" compiler warning flag. Fixes: a4fcc877d4e1 ("powerpc/pkeys: Preallocate execute-only key") Signed-off-by: Qian Cai Acked-by: Ram Pai Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1568733750-14580-1-git-send-email-cai@lca.pw --- arch/powerpc/mm/book3s64/pkeys.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index ae7fca40e5b3..59e0ebbd8036 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -307,16 +307,6 @@ void thread_pkey_regs_init(struct thread_struct *thread) write_iamr(pkey_iamr_mask); } -static inline bool pkey_allows_readwrite(int pkey) -{ - int pkey_shift = pkeyshift(pkey); - - if (!is_pkey_enabled(pkey)) - return true; - - return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift)); -} - int __execute_only_pkey(struct mm_struct *mm) { return mm->context.execute_only_pkey; -- cgit v1.2.3 From 5f5d6e40a01e70b731df843d8b5a61b4b28b19d9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 17 Sep 2019 18:08:51 +0530 Subject: powerpc/nvdimm: Update vmemmap_populated to check sub-section range With commit: 7cc7867fb061 ("mm/devm_memremap_pages: enable sub-section remap") pmem namespaces are remapped in 2M chunks. On architectures like ppc64 we can map the memmap area using 16MB hugepage size and that can cover a memory range of 16G. While enabling new pmem namespaces, since memory is added in sub-section chunks, before creating a new memmap mapping, kernel should check whether there is an existing memmap mapping covering the new pmem namespace. Currently, this is validated by checking whether the section covering the range is already initialized or not. Considering there can be multiple namespaces in the same section this can result in wrong validation. Update this to check for sub-sections in the range. This is done by checking for all pfns in the range we are mapping. We could optimize this by checking only just one pfn in each sub-section. But since this is not fast-path we keep this simple. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190917123851.22553-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/init_64.c | 54 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 4e08246acd79..83d8c7122d13 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -70,31 +70,46 @@ EXPORT_SYMBOL_GPL(kernstart_addr); #ifdef CONFIG_SPARSEMEM_VMEMMAP /* - * Given an address within the vmemmap, determine the pfn of the page that - * represents the start of the section it is within. Note that we have to + * Given an address within the vmemmap, determine the page that + * represents the start of the subsection it is within. Note that we have to * do this by hand as the proffered address may not be correctly aligned. * Subtraction of non-aligned pointers produces undefined results. */ -static unsigned long __meminit vmemmap_section_start(unsigned long page) +static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr) { - unsigned long offset = page - ((unsigned long)(vmemmap)); + unsigned long start_pfn; + unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap)); /* Return the pfn of the start of the section. */ - return (offset / sizeof(struct page)) & PAGE_SECTION_MASK; + start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK; + return pfn_to_page(start_pfn); } /* - * Check if this vmemmap page is already initialised. If any section - * which overlaps this vmemmap page is initialised then this page is - * initialised already. + * Since memory is added in sub-section chunks, before creating a new vmemmap + * mapping, the kernel should check whether there is an existing memmap mapping + * covering the new subsection added. This is needed because kernel can map + * vmemmap area using 16MB pages which will cover a memory range of 16G. Such + * a range covers multiple subsections (2M) + * + * If any subsection in the 16G range mapped by vmemmap is valid we consider the + * vmemmap populated (There is a page table entry already present). We can't do + * a page table lookup here because with the hash translation we don't keep + * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long start, int page_size) +static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { - unsigned long end = start + page_size; - start = (unsigned long)(pfn_to_page(vmemmap_section_start(start))); + struct page *start; + unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; + start = vmemmap_subsection_start(vmemmap_addr); - for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) - if (pfn_valid(page_to_pfn((struct page *)start))) + for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION) + /* + * pfn valid check here is intended to really check + * whether we have any subsection already initialized + * in this range. + */ + if (pfn_valid(page_to_pfn(start))) return 1; return 0; @@ -201,6 +216,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, void *p = NULL; int rc; + /* + * This vmemmap range is backing different subsections. If any + * of that subsection is marked valid, that means we already + * have initialized a page table covering this range and hence + * the vmemmap range is populated. + */ if (vmemmap_populated(start, page_size)) continue; @@ -290,9 +311,10 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, struct page *page; /* - * the section has already be marked as invalid, so - * vmemmap_populated() true means some other sections still - * in this page, so skip it. + * We have already marked the subsection we are trying to remove + * invalid. So if we want to remove the vmemmap range, we + * need to make sure there is no subsection marked valid + * in this range. */ if (vmemmap_populated(start, page_size)) continue; -- cgit v1.2.3 From 75838a3290cd4ebbd1f567f310ba04b6ef017ce4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Oct 2019 15:05:41 +0530 Subject: powerpc/pseries: Don't fail hash page table insert for bolted mapping If the hypervisor returned H_PTEG_FULL for H_ENTER hcall, retry a hash page table insert by removing a random entry from the group. After some runtime, it is very well possible to find all the 8 hash page table entry slot in the hpte group used for mapping. Don't fail a bolted entry insert in that case. With Storage class memory a user can find this error easily since a namespace enable/disable is equivalent to memory add/remove. This results in failures as reported below: $ ndctl create-namespace -r region1 -t pmem -m devdax -a 65536 -s 100M libndctl: ndctl_dax_enable: dax1.3: failed to enable Error: namespace1.2: failed to enable failed to create namespace: No such device or address In kernel log we find the details as below: Unable to create mapping for hot added memory 0xc000042006000000..0xc00004200d000000: -1 dax_pmem: probe of dax1.3 failed with error -14 This indicates that we failed to create a bolted hash table entry for direct-map address backing the namespace. We also observe failures such that not all namespaces will be enabled with ndctl enable-namespace all command. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191024093542.29777-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/hash_utils.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 6c123760164e..6e5a769ebcb8 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -294,7 +294,14 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, HPTE_V_BOLTED, psize, psize, ssize); - + if (ret == -1) { + /* Try to remove a non bolted entry */ + ret = mmu_hash_ops.hpte_remove(hpteg); + if (ret != -1) + ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, + ssize); + } if (ret < 0) break; -- cgit v1.2.3 From d78d5dace5398b542fd5a21b50db6e88ce7d392e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Oct 2019 15:05:42 +0530 Subject: powerpc/book3s64/hash: Use secondary hash for bolted mapping if the primary is full With bolted hash page table entry, kernel currently only use primary hash group when inserting the hash page table entry. In the rare case where kernel find all the 8 primary hash slot occupied by bolted entries, this can result in hash page table insert failure for bolted entries. Avoid this by using the secondary hash group. This is different from what kernel does for the non-bolted mapping. With non-bolted entries kernel will try secondary before removing an existing entry from hash page table group. With bolted prefer primary hash group and hence try to insert the page table entry by removing a slot from primary before trying the secondary hash group. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191024093542.29777-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/hash_native.c | 38 ++++++++++++++++++++++++++-------- arch/powerpc/mm/book3s64/hash_utils.c | 13 +++++++++++- arch/powerpc/platforms/pseries/lpar.c | 14 ++++++++++--- 3 files changed, 52 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 523e42eb11da..d2d8237ea9d5 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -482,19 +482,12 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, return ret; } -static long native_hpte_find(unsigned long vpn, int psize, int ssize) +static long __native_hpte_find(unsigned long want_v, unsigned long slot) { struct hash_pte *hptep; - unsigned long hash; + unsigned long hpte_v; unsigned long i; - long slot; - unsigned long want_v, hpte_v; - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted mappings are only ever in the primary group */ - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + slot; @@ -508,6 +501,33 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) return -1; } +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + unsigned long hpte_group; + unsigned long want_v; + unsigned long hash; + long slot; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. + */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __native_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + } + + return slot; +} + /* * Update the page protection bits. Intended to be used to create * guard pages for kernel data structures on pages which are bolted diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 6e5a769ebcb8..a9d1f72de848 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -263,6 +263,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long vsid = get_kernel_vsid(vaddr, ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); unsigned long tprot = prot; + bool secondary_hash = false; /* * If we hit a bad address return error. @@ -291,17 +292,27 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); BUG_ON(!mmu_hash_ops.hpte_insert); +repeat: ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, HPTE_V_BOLTED, psize, psize, ssize); if (ret == -1) { - /* Try to remove a non bolted entry */ + /* + * Try to to keep bolted entries in primary. + * Remove non bolted entries and try insert again + */ ret = mmu_hash_ops.hpte_remove(hpteg); if (ret != -1) ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot, HPTE_V_BOLTED, psize, psize, ssize); + if (ret == -1 && !secondary_hash) { + secondary_hash = true; + hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP); + goto repeat; + } } + if (ret < 0) break; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 3126fc02e50b..74c59a1e9627 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -938,11 +938,19 @@ static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); - /* Bolted entries are always in the primary group */ + /* + * We try to keep bolted entries always in primary hash + * But in some case we can find them in secondary too. + */ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot = __pSeries_lpar_hpte_find(want_v, hpte_group); - if (slot < 0) - return -1; + if (slot < 0) { + /* Try in secondary */ + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + if (slot < 0) + return -1; + } return hpte_group + slot; } -- cgit v1.2.3 From a42d6ba8c5be5aa597d25dbc15e336a2eca40260 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Oct 2019 13:27:59 +0530 Subject: powerpc/mm/book3s64/radix: Remove unused code. mm_tlb_flush_nested change was added in the mmu gather tlb flush to handle the case of parallel pte invalidate happening with mmap_sem held in read mode. This fix was done by commit 02390f66bd23 ("powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP") and the problem is explained in detail in commit 99baac21e458 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") This was later updated by commit 7a30df49f63a ("mm: mmu_gather: remove __tlb_reset_range() for force flush") to do a full mm flush rather than a range flush. By commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") we are also now allowing a page table free in mmap_sem read mode which means we should do a PWC flush too. Our current full mm flush imply a PWC flush. With all the above change the mm_tlb_flush_nested(mm) branch in radix__tlb_flush will never be taken because for the nested case we would have taken the if (tlb->fullmm) branch. This patch removes the unused code. Also, remove the gflush change in __radix__flush_tlb_range that was added to handle the range tlb flush code. We only check for THP there because hugetlb is flushed via a different code path where page size is explicitly specified. This is a partial revert of commit 02390f66bd23 ("powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191024075801.22434-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 66 ++++-------------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 67af871190c6..24d1f30556e0 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -832,8 +832,7 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end, - bool flush_all_sizes) + unsigned long start, unsigned long end) { unsigned long pid; @@ -879,26 +878,16 @@ is_local: } } } else { - bool hflush = flush_all_sizes; - bool gflush = flush_all_sizes; + bool hflush = false; unsigned long hstart, hend; - unsigned long gstart, gend; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - hflush = true; - - if (hflush) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { hstart = (start + PMD_SIZE - 1) & PMD_MASK; hend = end & PMD_MASK; if (hstart == hend) hflush = false; - } - - if (gflush) { - gstart = (start + PUD_SIZE - 1) & PUD_MASK; - gend = end & PUD_MASK; - if (gstart == gend) - gflush = false; + else + hflush = true; } if (local) { @@ -907,9 +896,6 @@ is_local: if (hflush) __tlbiel_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbiel_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); } else if (cputlb_use_tlbie()) { asm volatile("ptesync": : :"memory"); @@ -917,10 +903,6 @@ is_local: if (hflush) __tlbie_va_range(hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M); - if (gflush) - __tlbie_va_range(gstart, gend, pid, - PUD_SIZE, MMU_PAGE_1G); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); } else { _tlbiel_va_range_multicast(mm, @@ -928,9 +910,6 @@ is_local: if (hflush) _tlbiel_va_range_multicast(mm, hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); - if (gflush) - _tlbiel_va_range_multicast(mm, - gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false); } } preempt_enable(); @@ -945,7 +924,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, return radix__flush_hugetlb_tlb_range(vma, start, end); #endif - __radix__flush_tlb_range(vma->vm_mm, start, end, false); + __radix__flush_tlb_range(vma->vm_mm, start, end); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -1023,39 +1002,6 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (tlb->fullmm) { __flush_all_mm(mm, true); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) - } else if (mm_tlb_flush_nested(mm)) { - /* - * If there is a concurrent invalidation that is clearing ptes, - * then it's possible this invalidation will miss one of those - * cleared ptes and miss flushing the TLB. If this invalidate - * returns before the other one flushes TLBs, that can result - * in it returning while there are still valid TLBs inside the - * range to be invalidated. - * - * See mm/memory.c:tlb_finish_mmu() for more details. - * - * The solution to this is ensure the entire range is always - * flushed here. The problem for powerpc is that the flushes - * are page size specific, so this "forced flush" would not - * do the right thing if there are a mix of page sizes in - * the range to be invalidated. So use __flush_tlb_range - * which invalidates all possible page sizes in the range. - * - * PWC flush probably is not be required because the core code - * shouldn't free page tables in this path, but accounting - * for the possibility makes us a bit more robust. - * - * need_flush_all is an uncommon case because page table - * teardown should be done with exclusive locks held (but - * after locks are dropped another invalidate could come - * in), it could be optimized further if necessary. - */ - if (!tlb->need_flush_all) - __radix__flush_tlb_range(mm, start, end, true); - else - radix__flush_all_mm(mm); -#endif } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->need_flush_all) radix__flush_tlb_mm(mm); -- cgit v1.2.3 From 52162ec784fa05f3a4b1d8e84421279998be3773 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Oct 2019 13:28:00 +0530 Subject: powerpc/mm/book3s64/radix: Use freed_tables instead of need_flush_all With commit 22a61c3c4f13 ("asm-generic/tlb: Track freeing of page-table directories in struct mmu_gather") we now track whether we freed page table in mmu_gather. Use that to decide whether to flush Page Walk Cache. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191024075801.22434-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/pgalloc.h | 15 --------------- arch/powerpc/include/asm/book3s/64/tlbflush.h | 16 ---------------- arch/powerpc/mm/book3s64/radix_tlb.c | 11 +++-------- 3 files changed, 3 insertions(+), 39 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index d5a44912902f..f6968c811026 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -122,11 +122,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - /* - * By now all the pud entries should be none entries. So go - * ahead and flush the page walk cache - */ - flush_tlb_pgtable(tlb, address); pgtable_free_tlb(tlb, pud, PUD_INDEX); } @@ -143,11 +138,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - /* - * By now all the pud entries should be none entries. So go - * ahead and flush the page walk cache - */ - flush_tlb_pgtable(tlb, address); return pgtable_free_tlb(tlb, pmd, PMD_INDEX); } @@ -166,11 +156,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { - /* - * By now all the pud entries should be none entries. So go - * ahead and flush the page walk cache - */ - flush_tlb_pgtable(tlb, address); pgtable_free_tlb(tlb, table, PTE_INDEX); } diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 7aa8195b6cff..dcb5c3839d2f 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -147,22 +147,6 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, flush_tlb_page(vma, address); } -/* - * flush the page walk cache for the address - */ -static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long address) -{ - /* - * Flush the page table walk cache on freeing a page table. We already - * have marked the upper/higher level page table entry none by now. - * So it is safe to flush PWC here. - */ - if (!radix_enabled()) - return; - - radix__flush_tlb_pwc(tlb, address); -} - extern bool tlbie_capable; extern bool tlbie_enabled; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 24d1f30556e0..f9a4d5793f03 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -732,18 +732,13 @@ local: } preempt_enable(); } + void radix__flush_all_mm(struct mm_struct *mm) { __flush_all_mm(mm, false); } EXPORT_SYMBOL(radix__flush_all_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) -{ - tlb->need_flush_all = 1; -} -EXPORT_SYMBOL(radix__flush_tlb_pwc); - void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { @@ -1003,12 +998,12 @@ void radix__tlb_flush(struct mmu_gather *tlb) if (tlb->fullmm) { __flush_all_mm(mm, true); } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { - if (!tlb->need_flush_all) + if (!tlb->freed_tables) radix__flush_tlb_mm(mm); else radix__flush_all_mm(mm); } else { - if (!tlb->need_flush_all) + if (!tlb->freed_tables) radix__flush_tlb_range_psize(mm, start, end, psize); else radix__flush_tlb_pwc_range_psize(mm, start, end, psize); -- cgit v1.2.3 From 864edb758c50c30787bb51f2e26c4be2b4937025 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 24 Oct 2019 13:28:01 +0530 Subject: powerpc/mm/book3s64/radix: Flush the full mm even when need_flush_all is set With the previous patch, we should now not be using need_flush_all for powerpc. But then make sure we force a PID tlbie flush with RIC=2 if we ever find need_flush_all set. Also don't reset it after a mmu gather flush. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191024075801.22434-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 3 +-- include/asm-generic/tlb.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index f9a4d5793f03..a95175c0972b 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -995,7 +995,7 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (tlb->fullmm) { + if (tlb->fullmm || tlb->need_flush_all) { __flush_all_mm(mm, true); } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->freed_tables) @@ -1008,7 +1008,6 @@ void radix__tlb_flush(struct mmu_gather *tlb) else radix__flush_tlb_pwc_range_psize(mm, start, end, psize); } - tlb->need_flush_all = 0; } static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 04c0644006fd..e64991142a8b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -428,7 +428,7 @@ static inline void tlb_change_page_size(struct mmu_gather *tlb, { #ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { - if (!tlb->fullmm) + if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } -- cgit v1.2.3 From 16f6b67cf03cb43db7104acb2ca877bdc2606c92 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 1 Oct 2019 14:16:56 +0530 Subject: powerpc/book3s64/hash: Add cond_resched to avoid soft lockup warning With large memory (8TB and more) hotplug, we can get soft lockup warnings as below. These were caused by a long loop without any explicit cond_resched which is a problem for !PREEMPT kernels. Avoid this using cond_resched() while inserting hash page table entries. We already do similar cond_resched() in __add_pages(), see commit f64ac5e6e306 ("mm, memory_hotplug: add scheduling point to __add_pages"). rcu: 3-....: (24002 ticks this GP) idle=13e/1/0x4000000000000002 softirq=722/722 fqs=12001 (t=24003 jiffies g=4285 q=2002) NMI backtrace for cpu 3 CPU: 3 PID: 3870 Comm: ndctl Not tainted 5.3.0-197.18-default+ #2 Call Trace: dump_stack+0xb0/0xf4 (unreliable) nmi_cpu_backtrace+0x124/0x130 nmi_trigger_cpumask_backtrace+0x1ac/0x1f0 arch_trigger_cpumask_backtrace+0x28/0x3c rcu_dump_cpu_stacks+0xf8/0x154 rcu_sched_clock_irq+0x878/0xb40 update_process_times+0x48/0x90 tick_sched_handle.isra.16+0x4c/0x80 tick_sched_timer+0x68/0xe0 __hrtimer_run_queues+0x180/0x430 hrtimer_interrupt+0x110/0x300 timer_interrupt+0x108/0x2f0 decrementer_common+0x114/0x120 --- interrupt: 901 at arch_add_memory+0xc0/0x130 LR = arch_add_memory+0x74/0x130 memremap_pages+0x494/0x650 devm_memremap_pages+0x3c/0xa0 pmem_attach_disk+0x188/0x750 nvdimm_bus_probe+0xac/0x2c0 really_probe+0x148/0x570 driver_probe_device+0x19c/0x1d0 device_driver_attach+0xcc/0x100 bind_store+0x134/0x1c0 drv_attr_store+0x44/0x60 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1a0/0x270 __vfs_write+0x3c/0x70 vfs_write+0xd0/0x260 ksys_write+0xdc/0x130 system_call+0x5c/0x68 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191001084656.31277-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/hash_utils.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index a9d1f72de848..b30435c7d804 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -316,6 +316,7 @@ repeat: if (ret < 0) break; + cond_resched(); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled() && (paddr >> PAGE_SHIFT) < linear_map_hash_count) -- cgit v1.2.3 From 23eb7f560a2a6a1b0dbaaaae8685da75314347e4 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 4 Nov 2019 13:32:56 +1100 Subject: powerpc: Convert flush_icache_range & friends to C Similar to commit 22e9c88d486a ("powerpc/64: reuse PPC32 static inline flush_dcache_range()") this patch converts the following ASM symbols to C: flush_icache_range() __flush_dcache_icache() __flush_dcache_icache_phys() This was done as we discovered a long-standing bug where the length of the range was truncated due to using a 32 bit shift instead of a 64 bit one. By converting these functions to C, it becomes easier to maintain. flush_dcache_icache_phys() retains a critical assembler section as we must ensure there are no memory accesses while the data MMU is disabled (authored by Christophe Leroy). Since this has no external callers, it has also been made static, allowing the compiler to inline it within flush_dcache_icache_page(). Signed-off-by: Alastair D'Silva Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman [mpe: Minor fixups, don't export __flush_dcache_icache()] Link: https://lore.kernel.org/r/20191104023305.9581-5-alastair@au1.ibm.com --- arch/powerpc/include/asm/cache.h | 26 +++--- arch/powerpc/include/asm/cacheflush.h | 24 +++--- arch/powerpc/kernel/misc_32.S | 120 --------------------------- arch/powerpc/kernel/misc_64.S | 102 ----------------------- arch/powerpc/mm/mem.c | 150 +++++++++++++++++++++++++++++++++- 5 files changed, 170 insertions(+), 252 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index afb88754e0e0..72b81015cebe 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -96,22 +96,7 @@ static inline u32 l1_icache_bytes(void) } #endif -#endif /* ! __ASSEMBLY__ */ - -#if defined(__ASSEMBLY__) -/* - * For a snooping icache, we still need a dummy icbi to purge all the - * prefetched instructions from the ifetch buffers. We also need a sync - * before the icbi to order the the actual stores to memory that might - * have modified instructions with the icbi. - */ -#define PURGE_PREFETCHED_INS \ - sync; \ - icbi 0,r3; \ - sync; \ - isync -#else #define __read_mostly __attribute__((__section__(".data..read_mostly"))) #ifdef CONFIG_PPC_BOOK3S_32 @@ -145,6 +130,17 @@ static inline void dcbst(void *addr) { __asm__ __volatile__ ("dcbst 0, %0" : : "r"(addr) : "memory"); } + +static inline void icbi(void *addr) +{ + asm volatile ("icbi 0, %0" : : "r"(addr) : "memory"); +} + +static inline void iccci(void *addr) +{ + asm volatile ("iccci 0, %0" : : "r"(addr) : "memory"); +} + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_CACHE_H */ diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index ed57843ef452..4a1c9f0200e1 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -42,24 +42,20 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -extern void flush_icache_range(unsigned long, unsigned long); +void flush_icache_range(unsigned long start, unsigned long stop); extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -extern void __flush_dcache_icache(void *page_va); extern void flush_dcache_icache_page(struct page *page); -#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE) -extern void __flush_dcache_icache_phys(unsigned long physaddr); -#else -static inline void __flush_dcache_icache_phys(unsigned long physaddr) -{ - BUG(); -} -#endif - -/* - * Write any modified data cache blocks out to memory and invalidate them. - * Does not invalidate the corresponding instruction cache blocks. +void __flush_dcache_icache(void *page); + +/** + * flush_dcache_range(): Write any modified data cache blocks out to memory and + * invalidate them. Does not invalidate the corresponding instruction cache + * blocks. + * + * @start: the start address + * @stop: the stop address (exclusive) */ static inline void flush_dcache_range(unsigned long start, unsigned long stop) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 82df4b09e79f..f4e4a1926a7a 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -316,126 +316,6 @@ _GLOBAL(flush_instruction_cache) EXPORT_SYMBOL(flush_instruction_cache) #endif /* CONFIG_PPC_8xx */ -/* - * Write any modified data cache blocks out to memory - * and invalidate the corresponding instruction cache blocks. - * This is a no-op on the 601. - * - * flush_icache_range(unsigned long start, unsigned long stop) - */ -_GLOBAL(flush_icache_range) -#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) - PURGE_PREFETCHED_INS - blr /* for 601 and e200, do nothing */ -#else - rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT - subf r4,r3,r4 - addi r4,r4,L1_CACHE_BYTES - 1 - srwi. r4,r4,L1_CACHE_SHIFT - beqlr - mtctr r4 - mr r6,r3 -1: dcbst 0,r3 - addi r3,r3,L1_CACHE_BYTES - bdnz 1b - sync /* wait for dcbst's to get to ram */ -#ifndef CONFIG_44x - mtctr r4 -2: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 2b -#else - /* Flash invalidate on 44x because we are passed kmapped addresses and - this doesn't work for userspace pages due to the virtually tagged - icache. Sigh. */ - iccci 0, r0 -#endif - sync /* additional sync needed on g4 */ - isync - blr -#endif -_ASM_NOKPROBE_SYMBOL(flush_icache_range) -EXPORT_SYMBOL(flush_icache_range) - -/* - * Flush a particular page from the data cache to RAM. - * Note: this is necessary because the instruction cache does *not* - * snoop from the data cache. - * This is a no-op on the 601 and e200 which have a unified cache. - * - * void __flush_dcache_icache(void *page) - */ -_GLOBAL(__flush_dcache_icache) -#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) - PURGE_PREFETCHED_INS - blr -#else - rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ - li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ - mtctr r4 - mr r6,r3 -0: dcbst 0,r3 /* Write line to ram */ - addi r3,r3,L1_CACHE_BYTES - bdnz 0b - sync -#ifdef CONFIG_44x - /* We don't flush the icache on 44x. Those have a virtual icache - * and we don't have access to the virtual address here (it's - * not the page vaddr but where it's mapped in user space). The - * flushing of the icache on these is handled elsewhere, when - * a change in the address space occurs, before returning to - * user space - */ -BEGIN_MMU_FTR_SECTION - blr -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) -#endif /* CONFIG_44x */ - mtctr r4 -1: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 1b - sync - isync - blr -#endif - -#ifndef CONFIG_BOOKE -/* - * Flush a particular page from the data cache to RAM, identified - * by its physical address. We turn off the MMU so we can just use - * the physical address (this may be a highmem page without a kernel - * mapping). - * - * void __flush_dcache_icache_phys(unsigned long physaddr) - */ -_GLOBAL(__flush_dcache_icache_phys) -#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) - PURGE_PREFETCHED_INS - blr /* for 601 and e200, do nothing */ -#else - mfmsr r10 - rlwinm r0,r10,0,28,26 /* clear DR */ - mtmsr r0 - isync - rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ - li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ - mtctr r4 - mr r6,r3 -0: dcbst 0,r3 /* Write line to ram */ - addi r3,r3,L1_CACHE_BYTES - bdnz 0b - sync - mtctr r4 -1: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 1b - sync - mtmsr r10 /* restore DR */ - isync - blr -#endif -#endif /* CONFIG_BOOKE */ - /* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 9bc0aa9aeb65..ff20c253f273 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -49,108 +49,6 @@ _GLOBAL(call_do_irq) mtlr r0 blr - .section ".toc","aw" -PPC64_CACHES: - .tc ppc64_caches[TC],ppc64_caches - .section ".text" - -/* - * Write any modified data cache blocks out to memory - * and invalidate the corresponding instruction cache blocks. - * - * flush_icache_range(unsigned long start, unsigned long stop) - * - * flush all bytes from start through stop-1 inclusive - */ - -_GLOBAL_TOC(flush_icache_range) -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) -/* - * Flush the data cache to memory - * - * Different systems have different cache line sizes - * and in some cases i-cache and d-cache line sizes differ from - * each other. - */ - ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1BLOCKSIZE(r10)/* Get cache block size */ - addi r5,r7,-1 - andc r6,r3,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of cache block size */ - srd. r8,r8,r9 /* compute line count */ - beqlr /* nothing to do? */ - mtctr r8 -1: dcbst 0,r6 - add r6,r6,r7 - bdnz 1b - sync - -/* Now invalidate the instruction cache */ - - lwz r7,ICACHEL1BLOCKSIZE(r10) /* Get Icache block size */ - addi r5,r7,-1 - andc r6,r3,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 - lwz r9,ICACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of Icache block size */ - srd. r8,r8,r9 /* compute line count */ - beqlr /* nothing to do? */ - mtctr r8 -2: icbi 0,r6 - add r6,r6,r7 - bdnz 2b - isync - blr -_ASM_NOKPROBE_SYMBOL(flush_icache_range) -EXPORT_SYMBOL(flush_icache_range) - -/* - * Flush a particular page from the data cache to RAM. - * Note: this is necessary because the instruction cache does *not* - * snoop from the data cache. - * - * void __flush_dcache_icache(void *page) - */ -_GLOBAL(__flush_dcache_icache) -/* - * Flush the data cache to memory - * - * Different systems have different cache line sizes - */ - -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) - -/* Flush the dcache */ - ld r7,PPC64_CACHES@toc(r2) - clrrdi r3,r3,PAGE_SHIFT /* Page align */ - lwz r4,DCACHEL1BLOCKSPERPAGE(r7) /* Get # dcache blocks per page */ - lwz r5,DCACHEL1BLOCKSIZE(r7) /* Get dcache block size */ - mr r6,r3 - mtctr r4 -0: dcbst 0,r6 - add r6,r6,r5 - bdnz 0b - sync - -/* Now invalidate the icache */ - - lwz r4,ICACHEL1BLOCKSPERPAGE(r7) /* Get # icache blocks per page */ - lwz r5,ICACHEL1BLOCKSIZE(r7) /* Get icache block size */ - mtctr r4 -1: icbi 0,r3 - add r3,r3,r5 - bdnz 1b - isync - blr - _GLOBAL(__bswapdi2) EXPORT_SYMBOL(__bswapdi2) srdi r8,r3,32 diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index be941d382c8d..3392cacabe60 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -318,6 +318,120 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } +/** + * flush_coherent_icache() - if a CPU has a coherent icache, flush it + * @addr: The base address to use (can be any valid address, the whole cache will be flushed) + * Return true if the cache was flushed, false otherwise + */ +static inline bool flush_coherent_icache(unsigned long addr) +{ + /* + * For a snooping icache, we still need a dummy icbi to purge all the + * prefetched instructions from the ifetch buffers. We also need a sync + * before the icbi to order the the actual stores to memory that might + * have modified instructions with the icbi. + */ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + mb(); /* sync */ + icbi((void *)addr); + mb(); /* sync */ + isync(); + return true; + } + + return false; +} + +/** + * invalidate_icache_range() - Flush the icache by issuing icbi across an address range + * @start: the start address + * @stop: the stop address (exclusive) + */ +static void invalidate_icache_range(unsigned long start, unsigned long stop) +{ + unsigned long shift = l1_icache_shift(); + unsigned long bytes = l1_icache_bytes(); + char *addr = (char *)(start & ~(bytes - 1)); + unsigned long size = stop - (unsigned long)addr + (bytes - 1); + unsigned long i; + + for (i = 0; i < size >> shift; i++, addr += bytes) + icbi(addr); + + mb(); /* sync */ + isync(); +} + +/** + * flush_icache_range: Write any modified data cache blocks out to memory + * and invalidate the corresponding blocks in the instruction cache + * + * Generic code will call this after writing memory, before executing from it. + * + * @start: the start address + * @stop: the stop address (exclusive) + */ +void flush_icache_range(unsigned long start, unsigned long stop) +{ + if (flush_coherent_icache(start)) + return; + + clean_dcache_range(start, stop); + + if (IS_ENABLED(CONFIG_44x)) { + /* + * Flash invalidate on 44x because we are passed kmapped + * addresses and this doesn't work for userspace pages due to + * the virtually tagged icache. + */ + iccci((void *)start); + mb(); /* sync */ + isync(); + } else + invalidate_icache_range(start, stop); +} +EXPORT_SYMBOL(flush_icache_range); + +#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64) +/** + * flush_dcache_icache_phys() - Flush a page by it's physical address + * @physaddr: the physical address of the page + */ +static void flush_dcache_icache_phys(unsigned long physaddr) +{ + unsigned long bytes = l1_dcache_bytes(); + unsigned long nb = PAGE_SIZE / bytes; + unsigned long addr = physaddr & PAGE_MASK; + unsigned long msr, msr0; + unsigned long loop1 = addr, loop2 = addr; + + msr0 = mfmsr(); + msr = msr0 & ~MSR_DR; + /* + * This must remain as ASM to prevent potential memory accesses + * while the data MMU is disabled + */ + asm volatile( + " mtctr %2;\n" + " mtmsr %3;\n" + " isync;\n" + "0: dcbst 0, %0;\n" + " addi %0, %0, %4;\n" + " bdnz 0b;\n" + " sync;\n" + " mtctr %2;\n" + "1: icbi 0, %1;\n" + " addi %1, %1, %4;\n" + " bdnz 1b;\n" + " sync;\n" + " mtmsr %5;\n" + " isync;\n" + : "+&r" (loop1), "+&r" (loop2) + : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0) + : "ctr", "memory"); +} +#endif // !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64) + /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. We do the i-cache @@ -350,12 +464,46 @@ void flush_dcache_icache_page(struct page *page) __flush_dcache_icache(start); kunmap_atomic(start); } else { - __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); + unsigned long addr = page_to_pfn(page) << PAGE_SHIFT; + + if (flush_coherent_icache(addr)) + return; + flush_dcache_icache_phys(addr); } #endif } EXPORT_SYMBOL(flush_dcache_icache_page); +/** + * __flush_dcache_icache(): Flush a particular page from the data cache to RAM. + * Note: this is necessary because the instruction cache does *not* + * snoop from the data cache. + * + * @page: the address of the page to flush + */ +void __flush_dcache_icache(void *p) +{ + unsigned long addr = (unsigned long)p; + + if (flush_coherent_icache(addr)) + return; + + clean_dcache_range(addr, addr + PAGE_SIZE); + + /* + * We don't flush the icache on 44x. Those have a virtual icache and we + * don't have access to the virtual address here (it's not the page + * vaddr but where it's mapped in user space). The flushing of the + * icache on these is handled elsewhere, when a change in the address + * space occurs, before returning to user space. + */ + + if (cpu_has_feature(MMU_FTR_TYPE_44x)) + return; + + invalidate_icache_range(addr, addr + PAGE_SIZE); +} + void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); -- cgit v1.2.3 From 076265907cf9633bbef861c7c2a1c26a8209f283 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 4 Nov 2019 13:32:57 +1100 Subject: powerpc: Chunk calls to flush_dcache_range in arch_*_memory When presented with large amounts of memory being hotplugged (in my test case, ~890GB), the call to flush_dcache_range takes a while (~50 seconds), triggering RCU stalls. This patch breaks up the call into 1GB chunks, calling cond_resched() inbetween to allow the scheduler to run. Fixes: fb5924fddf9e ("powerpc/mm: Flush cache on memory hot(un)plug") Signed-off-by: Alastair D'Silva Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191104023305.9581-6-alastair@au1.ibm.com --- arch/powerpc/mm/mem.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3392cacabe60..634e5ea55b6b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -104,6 +104,27 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } +#define FLUSH_CHUNK_SIZE SZ_1G +/** + * flush_dcache_range_chunked(): Write any modified data cache blocks out to + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE + * Does not invalidate the corresponding instruction cache blocks. + * + * @start: the start address + * @stop: the stop address (exclusive) + * @chunk: the max size of the chunks + */ +static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, + unsigned long chunk) +{ + unsigned long i; + + for (i = start; i < stop; i += chunk) { + flush_dcache_range(i, min(stop, start + chunk)); + cond_resched(); + } +} + int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions) { @@ -120,7 +141,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, start, start + size, rc); return -EFAULT; } - flush_dcache_range(start, start + size); + + flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); return __add_pages(nid, start_pfn, nr_pages, restrictions); } @@ -137,7 +159,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); - flush_dcache_range(start, start + size); + flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); + ret = remove_section_mapping(start, start + size); WARN_ON_ONCE(ret); -- cgit v1.2.3 From ea458effa88e4f4739551d76fe3f702daf607995 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 4 Nov 2019 13:32:58 +1100 Subject: powerpc: Don't flush caches when adding memory This operation takes a significant amount of time when hotplugging large amounts of memory (~50 seconds with 890GB of persistent memory). This was orignally in commit fb5924fddf9e ("powerpc/mm: Flush cache on memory hot(un)plug") to support memtrace, but the flush on add is not needed as it is flushed on remove. Signed-off-by: Alastair D'Silva Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191104023305.9581-7-alastair@au1.ibm.com --- arch/powerpc/mm/mem.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 634e5ea55b6b..7573002077a6 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -142,8 +142,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return -EFAULT; } - flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); - return __add_pages(nid, start_pfn, nr_pages, restrictions); } -- cgit v1.2.3 From 4ed47dbefa299d7b36944f6d4001ee83612dd680 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:36 +0800 Subject: powerpc: move memstart_addr and kernstart_addr to init-common.c These two variables are both defined in init_32.c and init_64.c. Move them to init-common.c and make them __ro_after_init. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Reviewed-by: Diana Craciun Tested-by: Diana Craciun Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init-common.c | 5 +++++ arch/powerpc/mm/init_32.c | 5 ----- arch/powerpc/mm/init_64.c | 5 ----- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index a84da92920f7..e223da482c0c 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -21,6 +21,11 @@ #include #include +phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; +EXPORT_SYMBOL_GPL(memstart_addr); +phys_addr_t kernstart_addr __ro_after_init; +EXPORT_SYMBOL_GPL(kernstart_addr); + static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index b04896a88d79..872df48ae41b 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -56,11 +56,6 @@ phys_addr_t total_memory; phys_addr_t total_lowmem; -phys_addr_t memstart_addr = (phys_addr_t)~0ull; -EXPORT_SYMBOL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL(kernstart_addr); - #ifdef CONFIG_RELOCATABLE /* Used in __va()/__pa() */ long long virt_phys_offset; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 4e08246acd79..28b9596b4b90 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -63,11 +63,6 @@ #include -phys_addr_t memstart_addr = ~0; -EXPORT_SYMBOL_GPL(memstart_addr); -phys_addr_t kernstart_addr; -EXPORT_SYMBOL_GPL(kernstart_addr); - #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Given an address within the vmemmap, determine the pfn of the page that -- cgit v1.2.3 From 39f4b7bf7571a9c6529b0bb3de49c9bb0998f194 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:37 +0800 Subject: powerpc: introduce kernstart_virt_addr to store the kernel base Now the kernel base is a fixed value - KERNELBASE. To support KASLR, we need a variable to store the kernel base. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Reviewed-by: Diana Craciun Tested-by: Diana Craciun Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/page.h | 2 ++ arch/powerpc/mm/init-common.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index c8bb14ff4713..88fa53f89f5a 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -325,6 +325,8 @@ void arch_free_page(struct page *page, int order); struct vm_area_struct; +extern unsigned long kernstart_virt_addr; + #include #endif /* __ASSEMBLY__ */ #include diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index e223da482c0c..42ef7a6e6098 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -25,6 +25,8 @@ phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; EXPORT_SYMBOL_GPL(memstart_addr); phys_addr_t kernstart_addr __ro_after_init; EXPORT_SYMBOL_GPL(kernstart_addr); +unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE; +EXPORT_SYMBOL_GPL(kernstart_virt_addr); static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); -- cgit v1.2.3 From aa1d2090e69311c65f69c0fa2311d1d0f01c55f8 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:38 +0800 Subject: powerpc/fsl_booke/32: introduce create_kaslr_tlb_entry() helper Add a new helper create_kaslr_tlb_entry() to create a tlb entry by the virtual and physical address. This is a preparation to support boot kernel at a randomized address. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Reviewed-by: Diana Craciun Tested-by: Diana Craciun Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_fsl_booke.S | 35 +++++++++++++++++++++++++++++++++++ arch/powerpc/mm/mmu_decl.h | 1 + 2 files changed, 36 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index adf0505dbe02..8c1928176ffe 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -1114,6 +1114,41 @@ __secondary_hold_acknowledge: .long -1 #endif +/* + * Create a 64M tlb by address and entry + * r3 - entry + * r4 - virtual address + * r5/r6 - physical address + */ +_GLOBAL(create_kaslr_tlb_entry) + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 /* Write MAS0 */ + + lis r3,(MAS1_VALID|MAS1_IPROT)@h + ori r3,r3,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l + mtspr SPRN_MAS1,r3 /* Write MAS1 */ + + lis r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r3,r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r3,r3,r4 + ori r3,r3,MAS2_M_IF_NEEDED@l + mtspr SPRN_MAS2,r3 /* Write MAS2(EPN) */ + +#ifdef CONFIG_PHYS_64BIT + ori r8,r6,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ + mtspr SPRN_MAS7,r5 +#else + ori r8,r5,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ +#endif + + tlbwe /* Write TLB */ + isync + sync + blr + /* * Create a tlb entry with the same effective and physical address as * the tlb entry used by the current running code. But set the TS to 1. diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index c750ac9ec713..9da261ad54c3 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -139,6 +139,7 @@ extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, extern void adjust_total_lowmem(void); extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); +void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); #endif extern void loadcam_entry(unsigned int index); extern void loadcam_multi(int first_idx, int num, int tmp_idx); -- cgit v1.2.3 From c061b38a3e48663c29611e3b60afffe624d7c830 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:39 +0800 Subject: powerpc/fsl_booke/32: introduce reloc_kernel_entry() helper Add a new helper reloc_kernel_entry() to jump back to the start of the new kernel. After we put the new kernel in a randomized place we can use this new helper to enter the kernel and begin to relocate again. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Reviewed-by: Diana Craciun Tested-by: Diana Craciun Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_fsl_booke.S | 13 +++++++++++++ arch/powerpc/mm/mmu_decl.h | 1 + 2 files changed, 14 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 8c1928176ffe..d9f599b01ff1 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -1149,6 +1149,19 @@ _GLOBAL(create_kaslr_tlb_entry) sync blr +/* + * Return to the start of the relocated kernel and run again + * r3 - virtual address of fdt + * r4 - entry of the kernel + */ +_GLOBAL(reloc_kernel_entry) + mfmsr r7 + rlwinm r7, r7, 0, ~(MSR_IS | MSR_DS) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r7 + rfi + /* * Create a tlb entry with the same effective and physical address as * the tlb entry used by the current running code. But set the TS to 1. diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 9da261ad54c3..bbfd185fb99a 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -140,6 +140,7 @@ extern void adjust_total_lowmem(void); extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); +void reloc_kernel_entry(void *fdt, int addr); #endif extern void loadcam_entry(unsigned int index); extern void loadcam_multi(int first_idx, int num, int tmp_idx); -- cgit v1.2.3 From 2b0e86cc5de6dabadc2d64cefa429fc227c8a756 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:40 +0800 Subject: powerpc/fsl_booke/32: implement KASLR infrastructure This patch add support to boot kernel from places other than KERNELBASE. Since CONFIG_RELOCATABLE has already supported, what we need to do is map or copy kernel to a proper place and relocate. Freescale Book-E parts expect lowmem to be mapped by fixed TLB entries(TLB1). The TLB1 entries are not suitable to map the kernel directly in a randomized region, so we chose to copy the kernel to a proper place and restart to relocate. The offset of the kernel was not randomized yet(a fixed 64M is set). We will randomize it in the next patch. Signed-off-by: Jason Yan Tested-by: Diana Craciun Reviewed-by: Christophe Leroy Signed-off-by: Scott Wood [mpe: Use PTRRELOC() in early_init()] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 11 +++++ arch/powerpc/include/asm/nohash/mmu-book3e.h | 1 - arch/powerpc/kernel/early_32.c | 9 ++-- arch/powerpc/kernel/fsl_booke_entry_mapping.S | 15 +++---- arch/powerpc/kernel/head_fsl_booke.S | 13 ++++-- arch/powerpc/mm/mmu_decl.h | 7 +++ arch/powerpc/mm/nohash/Makefile | 1 + arch/powerpc/mm/nohash/fsl_booke.c | 7 ++- arch/powerpc/mm/nohash/kaslr_booke.c | 62 +++++++++++++++++++++++++++ 9 files changed, 109 insertions(+), 17 deletions(-) create mode 100644 arch/powerpc/mm/nohash/kaslr_booke.c (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3e56c9c2f16e..4c4a0fcd1674 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -551,6 +551,17 @@ config RELOCATABLE setting can still be useful to bootwrappers that need to know the load address of the kernel (eg. u-boot/mkimage). +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + depends on (FSL_BOOKE && FLATMEM && PPC32) + depends on RELOCATABLE + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + If unsure, say Y. + config RELOCATABLE_TEST bool "Test relocatable kernel" depends on (PPC64 && RELOCATABLE) diff --git a/arch/powerpc/include/asm/nohash/mmu-book3e.h b/arch/powerpc/include/asm/nohash/mmu-book3e.h index fa3efc2d310f..b41004664312 100644 --- a/arch/powerpc/include/asm/nohash/mmu-book3e.h +++ b/arch/powerpc/include/asm/nohash/mmu-book3e.h @@ -75,7 +75,6 @@ #define MAS2_E 0x00000001 #define MAS2_WIMGE_MASK 0x0000001f #define MAS2_EPN_MASK(size) (~0 << (size + 10)) -#define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags)) #define MAS3_RPN 0xFFFFF000 #define MAS3_U0 0x00000200 diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c index 3482118ffe76..ef2ad4945904 100644 --- a/arch/powerpc/kernel/early_32.c +++ b/arch/powerpc/kernel/early_32.c @@ -19,10 +19,13 @@ */ notrace unsigned long __init early_init(unsigned long dt_ptr) { - unsigned long offset = reloc_offset(); + unsigned long kva, offset = reloc_offset(); + + kva = *PTRRELOC(&kernstart_virt_addr); /* First zero the BSS */ - memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); + if (kva == KERNELBASE) + memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); /* * Identify the CPU type and fix up code sections @@ -32,5 +35,5 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) apply_feature_fixups(); - return KERNELBASE + offset; + return kva + offset; } diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index f4d3eaae54a9..8bccce6544b5 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -155,23 +155,22 @@ skpinv: addi r6,r6,1 /* Increment */ #if defined(ENTRY_MAPPING_BOOT_SETUP) -/* 6. Setup KERNELBASE mapping in TLB1[0] */ +/* 6. Setup kernstart_virt_addr mapping in TLB1[0] */ lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ mtspr SPRN_MAS0,r6 lis r6,(MAS1_VALID|MAS1_IPROT)@h ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l mtspr SPRN_MAS1,r6 - lis r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, MAS2_M_IF_NEEDED)@h - ori r6,r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, MAS2_M_IF_NEEDED)@l + lis r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r6,r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r6,r6,r20 + ori r6,r6,MAS2_M_IF_NEEDED@l mtspr SPRN_MAS2,r6 mtspr SPRN_MAS3,r8 tlbwe -/* 7. Jump to KERNELBASE mapping */ - lis r6,(KERNELBASE & ~0xfff)@h - ori r6,r6,(KERNELBASE & ~0xfff)@l - rlwinm r7,r25,0,0x03ffffff - add r6,r7,r6 +/* 7. Jump to kernstart_virt_addr mapping */ + mr r6,r20 #elif defined(ENTRY_MAPPING_KEXEC_SETUP) /* diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index d9f599b01ff1..838d9d4650c7 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -155,6 +155,8 @@ _ENTRY(_start); */ _ENTRY(__early_start) + LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) + lwz r20,0(r20) #define ENTRY_MAPPING_BOOT_SETUP #include "fsl_booke_entry_mapping.S" @@ -277,8 +279,8 @@ set_ivor: ori r6, r6, swapper_pg_dir@l lis r5, abatron_pteptrs@h ori r5, r5, abatron_pteptrs@l - lis r4, KERNELBASE@h - ori r4, r4, KERNELBASE@l + lis r3, kernstart_virt_addr@ha + lwz r4, kernstart_virt_addr@l(r3) stw r5, 0(r4) /* Save abatron_pteptrs at a fixed location */ stw r6, 0(r5) @@ -1067,7 +1069,12 @@ __secondary_start: mr r5,r25 /* phys kernel start */ rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ subf r4,r5,r4 /* memstart_addr - phys kernel start */ - li r5,0 /* no device tree */ + lis r7,KERNELBASE@h + ori r7,r7,KERNELBASE@l + cmpw r20,r7 /* if kernstart_virt_addr != KERNELBASE, randomized */ + beq 2f + li r4,0 +2: li r5,0 /* no device tree */ li r6,0 /* not boot cpu */ bl restore_to_as0 diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bbfd185fb99a..ae06c5675abb 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -141,10 +141,17 @@ extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys); void reloc_kernel_entry(void *fdt, int addr); +extern int is_second_reloc; #endif extern void loadcam_entry(unsigned int index); extern void loadcam_multi(int first_idx, int num, int tmp_idx); +#ifdef CONFIG_RANDOMIZE_BASE +void kaslr_early_init(void *dt_ptr, phys_addr_t size); +#else +static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {} +#endif + struct tlbcam { u32 MAS0; u32 MAS1; diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index 33b6f6f29d3f..0424f6ce5bd8 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o endif diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c index 556e3cd52a35..2dc27cf88add 100644 --- a/arch/powerpc/mm/nohash/fsl_booke.c +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -263,7 +263,8 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, int __initdata is_second_reloc; notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) { - unsigned long base = KERNELBASE; + unsigned long base = kernstart_virt_addr; + phys_addr_t size; kernstart_addr = start; if (is_second_reloc) { @@ -291,7 +292,7 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) start &= ~0x3ffffff; base &= ~0x3ffffff; virt_phys_offset = base - start; - early_get_first_memblock_info(__va(dt_ptr), NULL); + early_get_first_memblock_info(__va(dt_ptr), &size); /* * We now get the memstart_addr, then we should check if this * address is the same as what the PAGE_OFFSET map to now. If @@ -316,6 +317,8 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) /* We should never reach here */ panic("Relocation error"); } + + kaslr_early_init(__va(dt_ptr), size); } #endif #endif diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c new file mode 100644 index 000000000000..29c1567d8d40 --- /dev/null +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright (C) 2019 Jason Yan + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size, + unsigned long kernel_sz) +{ + /* return a fixed offset of 64M for now */ + return SZ_64M; +} + +/* + * To see if we need to relocate the kernel to a random offset + * void *dt_ptr - address of the device tree + * phys_addr_t size - size of the first memory block + */ +notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size) +{ + unsigned long tlb_virt; + phys_addr_t tlb_phys; + unsigned long offset; + unsigned long kernel_sz; + + kernel_sz = (unsigned long)_end - (unsigned long)_stext; + + offset = kaslr_choose_location(dt_ptr, size, kernel_sz); + if (offset == 0) + return; + + kernstart_virt_addr += offset; + kernstart_addr += offset; + + is_second_reloc = 1; + + if (offset >= SZ_64M) { + tlb_virt = round_down(kernstart_virt_addr, SZ_64M); + tlb_phys = round_down(kernstart_addr, SZ_64M); + + /* Create kernel map to relocate in */ + create_kaslr_tlb_entry(1, tlb_virt, tlb_phys); + } + + /* Copy the kernel to it's new location and run */ + memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz); + flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz); + + reloc_kernel_entry(dt_ptr, kernstart_virt_addr); +} -- cgit v1.2.3 From 6a38ea1d7b94c6c84dbf3f5c969be5e3648d9a70 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:41 +0800 Subject: powerpc/fsl_booke/32: randomize the kernel image offset After we have the basic support of relocate the kernel in some appropriate place, we can start to randomize the offset now. Entropy is derived from the banner and timer, which will change every build and boot. This not so much safe so additionally the bootloader may pass entropy via the /chosen/kaslr-seed node in device tree. We will use the first 512M of the low memory to randomize the kernel image. The memory will be split in 64M zones. We will use the lower 8 bit of the entropy to decide the index of the 64M zone. Then we chose a 16K aligned offset inside the 64M zone to put the kernel in. We also check if we will overlap with some areas like the dtb area, the initrd area or the crashkernel area. If we cannot find a proper area, kaslr will be disabled and boot from the original kernel. Some pieces of code are derived from arch/x86/boot/compressed/kaslr.c or arch/arm64/kernel/kaslr.c such as rotate_xor(). Credit goes to Kees and Ard. Signed-off-by: Jason Yan Reviewed-by: Diana Craciun Tested-by: Diana Craciun Reviewed-by: Christophe Leroy Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/kaslr_booke.c | 325 ++++++++++++++++++++++++++++++++++- 1 file changed, 323 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 29c1567d8d40..7b238fc2c8a9 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -12,15 +12,336 @@ #include #include #include +#include +#include #include #include +#include #include +#include +#include + +struct regions { + unsigned long pa_start; + unsigned long pa_end; + unsigned long kernel_size; + unsigned long dtb_start; + unsigned long dtb_end; + unsigned long initrd_start; + unsigned long initrd_end; + unsigned long crash_start; + unsigned long crash_end; + int reserved_mem; + int reserved_mem_addr_cells; + int reserved_mem_size_cells; +}; + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +struct regions __initdata regions; + +static __init void kaslr_get_cmdline(void *fdt) +{ + int node = fdt_path_offset(fdt, "/chosen"); + + early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line); +} + +static unsigned long __init rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + const unsigned long *ptr = area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple starting entropy. This can make it defferent for + * every build but it is still not enough. Stronger entropy should + * be added to make it change for every boot. + */ +static unsigned long __init get_boot_seed(void *fdt) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); + + return hash; +} + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init bool regions_overlap(u32 s1, u32 e1, u32 s2, u32 e2) +{ + return e1 >= s2 && e2 >= s1; +} + +static __init bool overlaps_reserved_region(const void *fdt, u32 start, + u32 end) +{ + int subnode, len, i; + u64 base, size; + + /* check for overlap with /memreserve/ entries */ + for (i = 0; i < fdt_num_mem_rsv(fdt); i++) { + if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0) + continue; + if (regions_overlap(start, end, base, base + size)) + return true; + } + + if (regions.reserved_mem < 0) + return false; + + /* check for overlap with static reservations in /reserved-memory */ + for (subnode = fdt_first_subnode(fdt, regions.reserved_mem); + subnode >= 0; + subnode = fdt_next_subnode(fdt, subnode)) { + const fdt32_t *reg; + u64 rsv_end; + + len = 0; + reg = fdt_getprop(fdt, subnode, "reg", &len); + while (len >= (regions.reserved_mem_addr_cells + + regions.reserved_mem_size_cells)) { + base = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_addr_cells == 2) + base = (base << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_addr_cells; + len -= 4 * regions.reserved_mem_addr_cells; + + size = fdt32_to_cpu(reg[0]); + if (regions.reserved_mem_size_cells == 2) + size = (size << 32) | fdt32_to_cpu(reg[1]); + + reg += regions.reserved_mem_size_cells; + len -= 4 * regions.reserved_mem_size_cells; + + if (base >= regions.pa_end) + continue; + + rsv_end = min(base + size, (u64)U32_MAX); + + if (regions_overlap(start, end, base, rsv_end)) + return true; + } + } + return false; +} + +static __init bool overlaps_region(const void *fdt, u32 start, + u32 end) +{ + if (regions_overlap(start, end, __pa(_stext), __pa(_end))) + return true; + + if (regions_overlap(start, end, regions.dtb_start, + regions.dtb_end)) + return true; + + if (regions_overlap(start, end, regions.initrd_start, + regions.initrd_end)) + return true; + + if (regions_overlap(start, end, regions.crash_start, + regions.crash_end)) + return true; + + return overlaps_reserved_region(fdt, start, end); +} + +static void __init get_crash_kernel(void *fdt, unsigned long size) +{ +#ifdef CONFIG_CRASH_CORE + unsigned long long crash_size, crash_base; + int ret; + + ret = parse_crashkernel(boot_command_line, size, &crash_size, + &crash_base); + if (ret != 0 || crash_size == 0) + return; + if (crash_base == 0) + crash_base = KDUMP_KERNELBASE; + + regions.crash_start = (unsigned long)crash_base; + regions.crash_end = (unsigned long)(crash_base + crash_size); + + pr_debug("crash_base=0x%llx crash_size=0x%llx\n", crash_base, crash_size); +#endif +} + +static void __init get_initrd_range(void *fdt) +{ + u64 start, end; + int node, len; + const __be32 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return; + + prop = fdt_getprop(fdt, node, "linux,initrd-start", &len); + if (!prop) + return; + start = of_read_number(prop, len / 4); + + prop = fdt_getprop(fdt, node, "linux,initrd-end", &len); + if (!prop) + return; + end = of_read_number(prop, len / 4); + + regions.initrd_start = (unsigned long)start; + regions.initrd_end = (unsigned long)end; + + pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", start, end); +} + +static __init unsigned long get_usable_address(const void *fdt, + unsigned long start, + unsigned long offset) +{ + unsigned long pa; + unsigned long pa_end; + + for (pa = offset; (long)pa > (long)start; pa -= SZ_16K) { + pa_end = pa + regions.kernel_size; + if (overlaps_region(fdt, pa, pa_end)) + continue; + + return pa; + } + return 0; +} + +static __init void get_cell_sizes(const void *fdt, int node, int *addr_cells, + int *size_cells) +{ + const int *prop; + int len; + + /* + * Retrieve the #address-cells and #size-cells properties + * from the 'node', or use the default if not provided. + */ + *addr_cells = *size_cells = 1; + + prop = fdt_getprop(fdt, node, "#address-cells", &len); + if (len == 4) + *addr_cells = fdt32_to_cpu(*prop); + prop = fdt_getprop(fdt, node, "#size-cells", &len); + if (len == 4) + *size_cells = fdt32_to_cpu(*prop); +} + +static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index, + unsigned long offset) +{ + unsigned long koffset = 0; + unsigned long start; + + while ((long)index >= 0) { + offset = memstart_addr + index * SZ_64M + offset; + start = memstart_addr + index * SZ_64M; + koffset = get_usable_address(dt_ptr, start, offset); + if (koffset) + break; + index--; + } + + if (koffset != 0) + koffset -= memstart_addr; + + return koffset; +} static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size, unsigned long kernel_sz) { - /* return a fixed offset of 64M for now */ - return SZ_64M; + unsigned long offset, random; + unsigned long ram, linear_sz; + u64 seed; + unsigned long index; + + kaslr_get_cmdline(dt_ptr); + + random = get_boot_seed(dt_ptr); + + seed = get_tb() << 32; + seed ^= get_tb(); + random = rotate_xor(random, &seed, sizeof(seed)); + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(dt_ptr); + if (seed) + random = rotate_xor(random, &seed, sizeof(seed)); + else + pr_warn("KASLR: No safe seed for randomizing the kernel base.\n"); + + ram = min_t(phys_addr_t, __max_low_memory, size); + ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true); + linear_sz = min_t(unsigned long, ram, SZ_512M); + + /* If the linear size is smaller than 64M, do not randmize */ + if (linear_sz < SZ_64M) + return 0; + + /* check for a reserved-memory node and record its cell sizes */ + regions.reserved_mem = fdt_path_offset(dt_ptr, "/reserved-memory"); + if (regions.reserved_mem >= 0) + get_cell_sizes(dt_ptr, regions.reserved_mem, + ®ions.reserved_mem_addr_cells, + ®ions.reserved_mem_size_cells); + + regions.pa_start = memstart_addr; + regions.pa_end = memstart_addr + linear_sz; + regions.dtb_start = __pa(dt_ptr); + regions.dtb_end = __pa(dt_ptr) + fdt_totalsize(dt_ptr); + regions.kernel_size = kernel_sz; + + get_initrd_range(dt_ptr); + get_crash_kernel(dt_ptr, ram); + + /* + * Decide which 64M we want to start + * Only use the low 8 bits of the random seed + */ + index = random & 0xFF; + index %= linear_sz / SZ_64M; + + /* Decide offset inside 64M */ + offset = random % (SZ_64M - kernel_sz); + offset = round_down(offset, SZ_16K); + + return kaslr_legal_offset(dt_ptr, index, offset); } /* -- cgit v1.2.3 From b39609720069f5a6eed2b3e3f618c23587021ff5 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:42 +0800 Subject: powerpc/fsl_booke/kaslr: clear the original kernel if randomized The original kernel still exists in the memory, clear it now. Signed-off-by: Jason Yan Reviewed-by: Christophe Leroy Reviewed-by: Diana Craciun Tested-by: Diana Craciun Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_decl.h | 2 ++ arch/powerpc/mm/nohash/fsl_booke.c | 1 + arch/powerpc/mm/nohash/kaslr_booke.c | 11 +++++++++++ 3 files changed, 14 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index ae06c5675abb..8e99649c24fc 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -148,8 +148,10 @@ extern void loadcam_multi(int first_idx, int num, int tmp_idx); #ifdef CONFIG_RANDOMIZE_BASE void kaslr_early_init(void *dt_ptr, phys_addr_t size); +void kaslr_late_init(void); #else static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {} +static inline void kaslr_late_init(void) {} #endif struct tlbcam { diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c index 2dc27cf88add..b4eb06ceb189 100644 --- a/arch/powerpc/mm/nohash/fsl_booke.c +++ b/arch/powerpc/mm/nohash/fsl_booke.c @@ -269,6 +269,7 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) kernstart_addr = start; if (is_second_reloc) { virt_phys_offset = PAGE_OFFSET - memstart_addr; + kaslr_late_init(); return; } diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 7b238fc2c8a9..aa1b60c782e7 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -381,3 +381,14 @@ notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size) reloc_kernel_entry(dt_ptr, kernstart_virt_addr); } + +void __init kaslr_late_init(void) +{ + /* If randomized, clear the original kernel */ + if (kernstart_virt_addr != KERNELBASE) { + unsigned long kernel_sz; + + kernel_sz = (unsigned long)_end - kernstart_virt_addr; + memzero_explicit((void *)KERNELBASE, kernel_sz); + } +} -- cgit v1.2.3 From 8c2ae87be5a4bb2d3cadff72d3aa5f1e3d5aac2b Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 20 Sep 2019 17:45:43 +0800 Subject: powerpc/fsl_booke/kaslr: support nokaslr cmdline parameter One may want to disable kaslr when boot, so provide a cmdline parameter 'nokaslr' to support this. Signed-off-by: Jason Yan Reviewed-by: Diana Craciun Tested-by: Diana Craciun Reviewed-by: Christophe Leroy Signed-off-by: Scott Wood Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/kaslr_booke.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index aa1b60c782e7..4a75f2d9bf0e 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -281,6 +281,11 @@ static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index return koffset; } +static inline __init bool kaslr_disabled(void) +{ + return strstr(boot_command_line, "nokaslr") != NULL; +} + static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size, unsigned long kernel_sz) { @@ -290,6 +295,8 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size unsigned long index; kaslr_get_cmdline(dt_ptr); + if (kaslr_disabled()) + return 0; random = get_boot_seed(dt_ptr); -- cgit v1.2.3 From 46ddcb3950a28c0df4815e8dbb8d4b91d5d9f22d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 15:21:55 +0000 Subject: powerpc/mm: Show if a bad page fault on data is read or write. DSISR (or ESR on some CPUs) has a bit to tell if the fault is due to a read or a write. Display it. Signed-off-by: Christophe Leroy Reviewed-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4f88d7e6fda53b5f80a71040ab400242f6c8cb93.1566400889.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/fault.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 8432c281de92..b5047f9b5dec 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -645,6 +645,7 @@ NOKPROBE_SYMBOL(do_page_fault); void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; + int is_write = page_fault_is_write(regs->dsisr); /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -658,9 +659,10 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) case 0x300: case 0x380: case 0xe00: - pr_alert("BUG: %s at 0x%08lx\n", + pr_alert("BUG: %s on %s at 0x%08lx\n", regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : - "Unable to handle kernel data access", regs->dar); + "Unable to handle kernel data access", + is_write ? "write" : "read", regs->dar); break; case 0x400: case 0x480: -- cgit v1.2.3 From a2227a27774328507a5c2335a6dd600c079d1ff5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 23 Aug 2019 09:56:21 +0000 Subject: powerpc/32: Don't populate page tables for block mapped pages except on the 8xx. Commit d2f15e0979ee ("powerpc/32: always populate page tables for Abatron BDI.") wrongly sets page tables for any PPC32 for using BDI, and does't update them after init (remove RX on init section, set text and rodata read-only) Only the 8xx requires page tables to be populated for using the BDI. They also need to be populated in order to see the mappings in /sys/kernel/debug/kernel_page_tables On BOOK3S_32, pages that are not mapped by page tables are mapped by BATs. The BDI knows BATs and they can be viewed in /sys/kernel/debug/powerpc/block_address_translation Only set pagetables for RAM and IMMR on the 8xx and properly update them at the end of init. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c8610942203e0d93fcb02ad20c57edd3adb4c9d3.1566554029.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/nohash/8xx.c | 52 +++++++++++++++++++++++++++++++++++++++++--- arch/powerpc/mm/pgtable_32.c | 5 +---- 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 4a06cb342da2..090af2d2d3e4 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -103,6 +103,19 @@ static void mmu_patch_addis(s32 *site, long simm) patch_instruction_site(site, instr); } +void __init mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, pgprot_t prot) +{ + unsigned long s = offset; + unsigned long v = PAGE_OFFSET + s; + phys_addr_t p = memstart_addr + s; + + for (; s < top; s += PAGE_SIZE) { + map_kernel_page(v, p, prot); + v += PAGE_SIZE; + p += PAGE_SIZE; + } +} + unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; @@ -115,10 +128,20 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, - _ALIGN(__pa(_einittext), 8 << 20)); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, einittext8); + + /* + * Populate page tables to: + * - have them appear in /sys/kernel/debug/kernel_page_tables + * - allow the BDI to find the pages when they are not PINNED + */ + mmu_mapin_ram_chunk(0, einittext8, PAGE_KERNEL_X); + mmu_mapin_ram_chunk(einittext8, mapped, PAGE_KERNEL); + mmu_mapin_immr(); } mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); @@ -144,18 +167,41 @@ void mmu_mark_initmem_nx(void) if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) mmu_patch_addis(&patch__itlbmiss_linmem_top8, -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); - if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) { + unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); + unsigned long etext = __pa(_etext); + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); + + /* Update page tables for PTDUMP and BDI */ + mmu_mapin_ram_chunk(0, einittext8, __pgprot(0)); + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { + mmu_mapin_ram_chunk(0, etext, PAGE_KERNEL_TEXT); + mmu_mapin_ram_chunk(etext, einittext8, PAGE_KERNEL); + } else { + mmu_mapin_ram_chunk(0, etext8, PAGE_KERNEL_TEXT); + mmu_mapin_ram_chunk(etext8, einittext8, PAGE_KERNEL); + } + } } #ifdef CONFIG_STRICT_KERNEL_RWX void mmu_mark_rodata_ro(void) { + unsigned long sinittext = __pa(_sinittext); + unsigned long etext = __pa(_etext); + if (CONFIG_DATA_SHIFT < 23) mmu_patch_addis(&patch__dtlbmiss_romem_top8, -__pa(((unsigned long)_sinittext) & ~(LARGE_PAGE_SIZE_8M - 1))); mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); + + /* Update page tables for PTDUMP and BDI */ + mmu_mapin_ram_chunk(0, sinittext, __pgprot(0)); + mmu_mapin_ram_chunk(0, etext, PAGE_KERNEL_ROX); + mmu_mapin_ram_chunk(etext, sinittext, PAGE_KERNEL_RO); } #endif diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 8ec5dfb65b2e..73b84166d06a 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -117,10 +117,7 @@ void __init mapin_ram(void) if (base >= top) continue; base = mmu_mapin_ram(base, top); - if (IS_ENABLED(CONFIG_BDI_SWITCH)) - __mapin_ram_chunk(reg->base, top); - else - __mapin_ram_chunk(base, top); + __mapin_ram_chunk(base, top); } } -- cgit v1.2.3 From d538aadc2718a95bfd80095c66ea814824535b34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 12 Sep 2019 13:49:44 +0000 Subject: powerpc/ioremap: warn on early use of ioremap() Powerpc now has EARLY_IOREMAP. Next step is to convert all early users of ioremap() to early_ioremap(). Add a warning to help locate those users. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b4f03a68ee8e68773c8973d74ec35f9c82c72871.1568295907.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ioremap_32.c | 1 + arch/powerpc/mm/ioremap_64.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index f36121f25243..743e11384dea 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -68,6 +68,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call /* * Should check if it is a candidate for a BAT mapping */ + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); err = early_ioremap_range(ioremap_bot - size, p, size, prot); if (err) diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index fd29e51700cd..50a99d9684f7 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -81,6 +81,8 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, if (slab_is_available()) return do_ioremap(paligned, offset, size, prot, caller); + pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); + err = early_ioremap_range(ioremap_bot, paligned, size, prot); if (err) return NULL; -- cgit v1.2.3 From cbcaff7d27ad5c5d2c2db113ec489be88adb815a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 16 Sep 2019 20:25:39 +0000 Subject: powerpc/32s: automatically allocate BAT in setbat() If no BAT is given to setbat(), select an available BAT. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a212bd36fbd6179e0929b6c727febc35132ac25c.1568665466.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/book3s32/mmu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 84d5fab94f8f..69b2419accef 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -251,9 +251,18 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, { unsigned int bl; int wimgxpp; - struct ppc_bat *bat = BATS[index]; + struct ppc_bat *bat; unsigned long flags = pgprot_val(prot); + if (index == -1) + index = find_free_bat(); + if (index == -1) { + pr_err("%s: no BAT available for mapping 0x%llx\n", __func__, + (unsigned long long)phys); + return; + } + bat = BATS[index]; + if ((flags & _PAGE_NO_CACHE) || (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) flags &= ~_PAGE_COHERENT; -- cgit v1.2.3 From f2bb86937d86ebcb0e52f95b6d19aba1d850e601 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 12 Sep 2019 13:49:41 +0000 Subject: powerpc/fixmap: don't clear fixmap area in paging_init() fixmap is intended to map things permanently like the IMMR region on FSL SOC (8xx, 83xx, ...), so don't clear it when initialising paging() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/41c99bc06394a6bc2888631cb98a3ed2ae281ddb.1568295907.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/mem.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 7573002077a6..bf0c54b1f161 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -237,14 +237,6 @@ void __init paging_init(void) unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); -#ifdef CONFIG_PPC32 - unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); - unsigned long end = __fix_to_virt(FIX_HOLE); - - for (; v < end; v += PAGE_SIZE) - map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ -#endif - #ifdef CONFIG_HIGHMEM map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); -- cgit v1.2.3 From 2807273f5e88ed086d7d5d838fdee71e11e5085f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Nov 2019 07:59:22 +0000 Subject: powerpc/fixmap: fix crash with HIGHMEM Commit f2bb86937d86 ("powerpc/fixmap: don't clear fixmap area in paging_init()") removed the clearing of fixmap area in order to avoid clearing fixmapped areas set earlier. However unlike all other users of fixmap which use __set_fixmap(), HIGHMEM functions directly use __set_pte_at(). This means the page table must pre-exist, otherwise the following crash can be encoutered due to the lack of entry in the PGD. Oops: Kernel access of bad area, sig: 11 [#1] BE PAGE_SIZE=4K MMU=Hash PowerMac Modules linked in: CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0+ #2528 NIP: c0144ce8 LR: c0144ccc CTR: 00000080 REGS: ef0b5aa0 TRAP: 0300 Not tainted (5.4.0+) MSR: 00009032 CR: 44282842 XER: 00000000 DAR: fffdf000 DSISR: 42000000 GPR00: c0144ccc ef0b5b58 ef0b0000 fffdf000 fffdf000 00000000 c0000f7c 00000000 GPR08: c0833000 fffdf000 00000000 ef1c53c9 24042842 00000000 00000000 00000000 GPR16: 00000000 00000000 ef7e7358 effe8160 00000000 c08a9660 c0851644 00000004 GPR24: c08c70a8 00002dc2 00000000 00000001 00000201 effe8160 effe8160 00000000 NIP [c0144ce8] prep_new_page+0x138/0x178 LR [c0144ccc] prep_new_page+0x11c/0x178 Call Trace: [ef0b5b58] [c0144ccc] prep_new_page+0x11c/0x178 (unreliable) [ef0b5b88] [c0147218] get_page_from_freelist+0x1fc/0xd88 [ef0b5c38] [c0148328] __alloc_pages_nodemask+0xd4/0xbb4 [ef0b5cf8] [c0142ba8] __vmalloc_node_range+0x1b4/0x2e0 [ef0b5d38] [c0142dd0] vzalloc+0x48/0x58 [ef0b5d58] [c0301c8c] check_partition+0x58/0x244 [ef0b5d78] [c02ffe80] blk_add_partitions+0x44/0x2cc [ef0b5db8] [c01a32d8] bdev_disk_changed+0x68/0xfc [ef0b5de8] [c01a4494] __blkdev_get+0x290/0x460 [ef0b5e28] [c02fdd40] __device_add_disk+0x480/0x4d8 [ef0b5e68] [c0810688] brd_init+0xc0/0x188 [ef0b5e88] [c0005194] do_one_initcall+0x40/0x19c [ef0b5ee8] [c07dd4dc] kernel_init_freeable+0x164/0x230 [ef0b5f28] [c0005408] kernel_init+0x18/0x10c [ef0b5f38] [c0014274] ret_from_kernel_thread+0x14/0x1c Partially revert that commit to still clear the fixmap area dedicated to HIGHMEM. Fixes: f2bb86937d86 ("powerpc/fixmap: don't clear fixmap area in paging_init()") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d42fa9747df5afa41e67b08e374c98d3b40529c9.1574927918.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/mem.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index bf0c54b1f161..b5835a71f10f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -238,6 +238,12 @@ void __init paging_init(void) phys_addr_t top_of_ram = memblock_end_of_DRAM(); #ifdef CONFIG_HIGHMEM + unsigned long v = __fix_to_virt(FIX_KMAP_END); + unsigned long end = __fix_to_virt(FIX_KMAP_BEGIN); + + for (; v < end; v += PAGE_SIZE) + map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); -- cgit v1.2.3