From 07e326610e5634e5038fce32fff370949eb42101 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Mon, 12 Dec 2016 16:42:40 -0800
Subject: mm: add tlb_remove_check_page_size_change to track page size change

With commit e77b0852b551 ("mm/mmu_gather: track page size with mmu gather
and force flush if page size change") we added the ability to force a tlb
flush when the page size changes in a mmu_gather loop. We did that by
checking for a page size change every time we added a page to the
mmu_gather for lazy flush/remove.

We can improve that by moving the page size change check earlier and not
doing it every time we add a page.

This also helps us to do a tlb flush when invalidating a range covering a
dax mapping. For a dax mapping we don't have a backing struct page and
hence we don't call tlb_remove_page, which earlier forced the tlb flush
on a page size change. Moving the page size change check earlier means
we will do the same even for dax mappings.

We also avoid doing this check on architectures other than powerpc.

In a later patch we will remove the page size check from
tlb_remove_page().

Link: http://lkml.kernel.org/r/20161026084839.27299-5-aneesh.kumar@linux.vnet.ibm.com
Signed-off-by: Aneesh Kumar K.V
Cc: "Kirill A. Shutemov"
Cc: Dan Williams
Cc: Ross Zwisler
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/powerpc/include/asm/tlb.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 99e1397b71da..609557569f65 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -28,6 +28,7 @@
 #define tlb_start_vma(tlb, vma)	do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change

 extern void tlb_flush(struct mmu_gather *tlb);

@@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }

+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+	if (!tlb->page_size)
+		tlb->page_size = page_size;
+	else if (tlb->page_size != page_size) {
+		tlb_flush_mmu(tlb);
+		/*
+		 * update the page size after flush for the new
+		 * mmu_gather.
+		 */
+		tlb->page_size = page_size;
+	}
+}
+
 #ifdef CONFIG_SMP
 static inline int mm_is_core_local(struct mm_struct *mm)
 {
--
cgit v1.2.3
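For context, the generic-mm side of this commit (not shown in this listing, which is limited to arch/powerpc) calls the new hook from the zap paths before any pages are queued. A minimal sketch of such a caller follows; the exact call sites are an assumption based on the changelog, and the function shown is illustrative rather than a copy of the real zap_huge_pmd():

/*
 * Illustrative sketch only: announce the page size that the entries
 * about to be gathered will use.  On powerpc this forces a flush if the
 * mmu_gather already holds pages of a different size; per the changelog
 * above, other architectures get a fallback that does not force a flush.
 */
static int zap_huge_pmd_sketch(struct mmu_gather *tlb,
			       struct vm_area_struct *vma,
			       pmd_t *pmd, unsigned long addr)
{
	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
	/* ... clear the pmd and queue the huge page for a lazy flush ... */
	return 1;
}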
From 4a3bac4e3ac212c31edd8b124a1a2c7e8c1767ed Mon Sep 17 00:00:00 2001
From: Reza Arbab
Date: Mon, 12 Dec 2016 16:42:52 -0800
Subject: powerpc/mm: allow memory hotplug into a memoryless node

Patch series "enable movable nodes on non-x86 configs", v7.

This patchset allows more configs to make use of movable nodes. When
CONFIG_MOVABLE_NODE is selected, there are two ways to introduce such
nodes into the system:

1. Discover movable nodes at boot. Currently this is only possible on
   x86, but we will enable configs supporting fdt to do the same.

2. Hotplug and online all of a node's memory using online_movable. This
   is already possible on any config supporting memory hotplug, not just
   x86, but the Kconfig doesn't say so. We will fix that.

We'll also remove some cruft on power which would prevent (2).

This patch (of 5):

Remove the check which prevents us from hotplugging into an empty node.
The original commit b226e4621245 ("[PATCH] powerpc: don't add memory to
empty node/zone") states that this was intended to be a temporary
measure. It is a workaround for an oops which no longer occurs.

Link: http://lkml.kernel.org/r/1479160961-25840-2-git-send-email-arbab@linux.vnet.ibm.com
Signed-off-by: Reza Arbab
Reviewed-by: Aneesh Kumar K.V
Acked-by: Balbir Singh
Acked-by: Michael Ellerman
Cc: "Aneesh Kumar K.V"
Cc: "H. Peter Anvin"
Cc: Alistair Popple
Cc: Benjamin Herrenschmidt
Cc: Bharata B Rao
Cc: Frank Rowand
Cc: Ingo Molnar
Cc: Nathan Fontenot
Cc: Paul Mackerras
Cc: Rob Herring
Cc: Stewart Smith
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/powerpc/mm/numa.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a51c188b81f3..0cb6bd8bfccf 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
 	struct device_node *memory = NULL;
-	int nid, found = 0;
+	int nid;

 	if (!numa_enabled || (min_common_depth < 0))
 		return first_online_node;
@@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 	if (nid < 0 || !node_online(nid))
 		nid = first_online_node;

-	if (NODE_DATA(nid)->node_spanned_pages)
-		return nid;
-
-	for_each_online_node(nid) {
-		if (NODE_DATA(nid)->node_spanned_pages) {
-			found = 1;
-			break;
-		}
-	}
-
-	BUG_ON(!found);
 	return nid;
 }
--
cgit v1.2.3
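For reference, the nid computed by hot_add_scn_to_nid() feeds the generic hotplug path; on powerpc of this era the routing went roughly through memory_add_physaddr_to_nid(), as sketched below. This is an illustrative reconstruction rather than part of the diff: the point is only that the returned nid may now name a memoryless node, which add_memory() and subsequent onlining (e.g. online_movable) then populate.

/* Illustrative only: the hotplug consumer of the nid computed above. */
int memory_add_physaddr_to_nid(u64 start)
{
	return hot_add_scn_to_nid(start);	/* may name a memoryless node now */
}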
From 1dd38b6c27d59414e89c08dd1ae9677a8e12cbc4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Mon, 12 Dec 2016 16:44:29 -0800
Subject: mm: move vma_is_anonymous check within pmd_move_must_withdraw

Independent of whether the vma is for anonymous memory, some arches like
ppc64 would like to override pmd_move_must_withdraw().

One option is to encapsulate the vma_is_anonymous() check for general
architectures inside pmd_move_must_withdraw() so that it is always
called, and architectures that need unconditional overriding can
override this function. ppc64 needs to override the function when the
MMU is configured to use hash PTEs.

[bsingharora@gmail.com: reworked changelog]
Link: http://lkml.kernel.org/r/20161113150025.17942-1-aneesh.kumar@linux.vnet.ibm.com
Signed-off-by: Aneesh Kumar K.V
Acked-by: Kirill A. Shutemov
Acked-by: Michael Ellerman (powerpc)
Cc: Benjamin Herrenschmidt
Cc: Michael Neuling
Cc: Paul Mackerras
Cc: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  3 ++-
 include/asm-generic/pgtable.h                | 12 ------------
 mm/huge_memory.c                             | 18 ++++++++++++++++--
 3 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 9fd77f8794a0..700301bc5190 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-					 struct spinlock *old_pmd_ptl)
+					 struct spinlock *old_pmd_ptl,
+					 struct vm_area_struct *vma)
 {
 	if (radix_enabled())
 		return false;

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 41b95d82a185..2065e81701fc 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -652,18 +652,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif

-#ifndef pmd_move_must_withdraw
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-					 spinlock_t *old_pmd_ptl)
-{
-	/*
-	 * With split pmd lock we also need to move preallocated
-	 * PTE page table if new_pmd is on different PMD page table.
-	 */
-	return new_pmd_ptl != old_pmd_ptl;
-}
-#endif
-
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 26fd1161ca85..b54044c21076 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1429,6 +1429,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 }

+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+					 spinlock_t *old_pmd_ptl,
+					 struct vm_area_struct *vma)
+{
+	/*
+	 * With split pmd lock we also need to move preallocated
+	 * PTE page table if new_pmd is on different PMD page table.
+	 *
+	 * We also don't deposit and withdraw tables for file pages.
+	 */
+	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1466,8 +1481,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		force_flush = true;
 		VM_BUG_ON(!pmd_none(*new_pmd));

-		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
-				vma_is_anonymous(vma)) {
+		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
 			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
--
cgit v1.2.3
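The override mechanism is the usual define-the-symbol pattern: an architecture defines pmd_move_must_withdraw as a macro so the generic fallback added to mm/huge_memory.c above compiles out. A hypothetical architecture that always wants the deposited table moved could do something like the following sketch (not taken from any real arch; shown only to illustrate the hook):

/* In <asm/pgtable.h> of a hypothetical architecture -- illustrative only. */
#define pmd_move_must_withdraw pmd_move_must_withdraw
struct spinlock;
static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
					 struct spinlock *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * Ignore the lock comparison and the vma type: this arch keeps
	 * per-pmd metadata in the deposited table and must always move it.
	 */
	return true;
}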
From 953c66c2b22a304dbc3c3d7fc8e8c25cd97a03d8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Mon, 12 Dec 2016 16:44:32 -0800
Subject: mm: THP page cache support for ppc64

Add an arch-specific callback in the generic THP page cache code that
will deposit and withdraw the preallocated page table. Archs like ppc64
use this preallocated table to store the hash pte slot information.

Testing:
kernel build of the patch series on tmpfs mounted with option huge=always

The related thp stats:

thp_fault_alloc 72939
thp_fault_fallback 60547
thp_collapse_alloc 603
thp_collapse_alloc_failed 0
thp_file_alloc 253763
thp_file_mapped 4251
thp_split_page 51518
thp_split_page_failed 1
thp_deferred_split_page 73566
thp_split_pmd 665
thp_zero_page_alloc 3
thp_zero_page_alloc_failed 0

[akpm@linux-foundation.org: remove unneeded parentheses, per Kirill]
Link: http://lkml.kernel.org/r/20161113150025.17942-2-aneesh.kumar@linux.vnet.ibm.com
Signed-off-by: Aneesh Kumar K.V
Acked-by: Kirill A. Shutemov
Cc: Michael Ellerman
Cc: Benjamin Herrenschmidt
Cc: Michael Neuling
Cc: Paul Mackerras
Cc: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++
 include/asm-generic/pgtable.h                |  3 ++
 mm/Kconfig                                   |  6 +--
 mm/huge_memory.c                             | 17 ++++++++
 mm/khugepaged.c                              | 21 +++++++++-
 mm/memory.c                                  | 60 +++++++++++++++++++++++-----
 6 files changed, 100 insertions(+), 17 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 700301bc5190..0ebfbc8f0449 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 	 */
 	return true;
 }
+
+
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+	if (radix_enabled())
+		return false;
+	return true;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 2065e81701fc..18af2bcefe6a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -652,6 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif

+#ifndef arch_needs_pgtable_deposit
+#define arch_needs_pgtable_deposit() (false)
+#endif
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and

diff --git a/mm/Kconfig b/mm/Kconfig
index 33a9b06ec618..9b8fccb969dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -447,13 +447,9 @@ choice
 	  benefit.
 endchoice

-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config TRANSPARENT_HUGE_PAGECACHE
 	def_bool y
-	depends on TRANSPARENT_HUGEPAGE && !PPC
+	depends on TRANSPARENT_HUGEPAGE

 #
 # UP and nommu archs use km based percpu allocator

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b54044c21076..2b44ac11178f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1380,6 +1380,15 @@ out_unlocked:
 	return ret;
 }

+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+	pgtable_t pgtable;
+
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+	pte_free(mm, pgtable);
+	atomic_long_dec(&mm->nr_ptes);
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
@@ -1421,6 +1430,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			atomic_long_dec(&tlb->mm->nr_ptes);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 		} else {
+			if (arch_needs_pgtable_deposit())
+				zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 		}
 		spin_unlock(ptl);
@@ -1607,6 +1618,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,

 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+		/*
+		 * We are going to unmap this huge page. So
+		 * just go ahead and zap it
+		 */
+		if (arch_needs_pgtable_deposit())
+			zap_deposited_table(mm, pmd);
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7a50c726c5ae..09460955e818 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 	struct vm_area_struct *vma;
 	unsigned long addr;
 	pmd_t *pmd, _pmd;
+	bool deposited = false;

 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
 			/* assume page table is clear */
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
+			/*
+			 * now deposit the pgtable for arch that need it
+			 * otherwise free it.
+			 */
+			if (arch_needs_pgtable_deposit()) {
+				/*
+				 * The deposit should be visibile only after
+				 * collapse is seen by others.
+				 */
+				smp_wmb();
+				pgtable_trans_huge_deposit(vma->vm_mm, pmd,
+							   pmd_pgtable(_pmd));
+				deposited = true;
+			}
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
-			atomic_long_dec(&vma->vm_mm->nr_ptes);
-			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+			if (!deposited) {
+				atomic_long_dec(&vma->vm_mm->nr_ptes);
+				pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+			}
 		}
 	}
 	i_mmap_unlock_write(mapping);

diff --git a/mm/memory.c b/mm/memory.c
index 0a72f821ccdc..32e9b7aec366 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 	return true;
 }

+static void deposit_prealloc_pte(struct fault_env *fe)
+{
+	struct vm_area_struct *vma = fe->vma;
+
+	pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+	/*
+	 * We are going to consume the prealloc table,
+	 * count that as nr_ptes.
+	 */
+	atomic_long_inc(&vma->vm_mm->nr_ptes);
+	fe->prealloc_pte = 0;
+}
+
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 {
 	struct vm_area_struct *vma = fe->vma;
@@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 	ret = VM_FAULT_FALLBACK;
 	page = compound_head(page);

+	/*
+	 * Archs like ppc64 need additonal space to store information
+	 * related to pte entry. Use the preallocated table for that.
+	 */
+	if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
+		fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+		if (!fe->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
 	if (unlikely(!pmd_none(*fe->pmd)))
 		goto out;
@@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)

 	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
 	page_add_file_rmap(page, true);
+	/*
+	 * deposit and withdraw with pmd lock held
+	 */
+	if (arch_needs_pgtable_deposit())
+		deposit_prealloc_pte(fe);

 	set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);

@@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 	ret = 0;
 	count_vm_event(THP_FILE_MAPPED);
 out:
+	/*
+	 * If we are going to fallback to pte mapping, do a
+	 * withdraw with pmd lock held.
+	 */
+	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+		fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+							       fe->pmd);
 	spin_unlock(fe->ptl);
 	return ret;
 }
@@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,

 		ret = do_set_pmd(fe, page);
 		if (ret != VM_FAULT_FALLBACK)
-			return ret;
+			goto fault_handled;
 	}

 	if (!fe->pte) {
 		ret = pte_alloc_one_map(fe);
 		if (ret)
-			return ret;
+			goto fault_handled;
 	}

 	/* Re-check under ptl */
-	if (unlikely(!pte_none(*fe->pte)))
-		return VM_FAULT_NOPAGE;
+	if (unlikely(!pte_none(*fe->pte))) {
+		ret = VM_FAULT_NOPAGE;
+		goto fault_handled;
+	}

 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,

 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, fe->address, fe->pte);
+	ret = 0;

-	return 0;
+fault_handled:
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	return ret;
 }

 static unsigned long fault_around_bytes __read_mostly =
@@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)

 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);

-	/* preallocated pagetable is unused: free it */
-	if (fe->prealloc_pte) {
-		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-		fe->prealloc_pte = 0;
-	}
 	/* Huge page is mapped? Page fault is solved */
 	if (pmd_trans_huge(*fe->pmd)) {
 		ret = VM_FAULT_NOPAGE;
--
cgit v1.2.3
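Taken together, the lifecycle this patch establishes for a file-backed huge page, on an architecture where arch_needs_pgtable_deposit() is true, can be condensed as below. The two helper names are hypothetical and exist only for this summary; the real logic lives in do_set_pmd(), zap_huge_pmd() and zap_deposited_table() in the diff above, with locking and error handling elided here.

/*
 * Condensed, illustrative sketch of the deposit/withdraw pairing for
 * file THP.  The deposit happens with the pmd lock held at map time and
 * is paid back at unmap time.
 */
static void map_file_thp_sketch(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long haddr, pmd_t entry,
				pgtable_t prealloc_pte)
{
	if (arch_needs_pgtable_deposit()) {
		/* stash the preallocated PTE table under the pmd */
		pgtable_trans_huge_deposit(vma->vm_mm, pmd, prealloc_pte);
		atomic_long_inc(&vma->vm_mm->nr_ptes);
	}
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
}

static void unmap_file_thp_sketch(struct mm_struct *mm, pmd_t *pmd)
{
	if (arch_needs_pgtable_deposit()) {
		/* give back what was deposited at map time */
		pgtable_t pgtable = pgtable_trans_huge_withdraw(mm, pmd);

		pte_free(mm, pgtable);
		atomic_long_dec(&mm->nr_ptes);
	}
}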