From 57a196a58421a4b0c45949ae7309f21829aaa77f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sun, 18 Sep 2022 19:13:48 -0700 Subject: hugetlb: simplify hugetlb handling in follow_page_mask During discussions of this series [1], it was suggested that hugetlb handling code in follow_page_mask could be simplified. At the beginning of follow_page_mask, there currently is a call to follow_huge_addr which 'may' handle hugetlb pages. ia64 is the only architecture which provides a follow_huge_addr routine that does not return error. Instead, at each level of the page table a check is made for a hugetlb entry. If a hugetlb entry is found, a call to a routine associated with that entry is made. Currently, there are two checks for hugetlb entries at each page table level. The first check is of the form: if (p?d_huge()) page = follow_huge_p?d(); the second check is of the form: if (is_hugepd()) page = follow_huge_pd(). We can replace these checks, as well as the special handling routines such as follow_huge_p?d() and follow_huge_pd() with a single routine to handle hugetlb vmas. A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the beginning of follow_page_mask. hugetlb_follow_page_mask will use the existing routine huge_pte_offset to walk page tables looking for hugetlb entries. huge_pte_offset can be overwritten by architectures, and already handles special cases such as hugepd entries. [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/ [mike.kravetz@oracle.com: remove vma (pmd sharing) per Peter] Link: https://lkml.kernel.org/r/20221028181108.119432-1-mike.kravetz@oracle.com [mike.kravetz@oracle.com: remove left over hugetlb_vma_unlock_read()] Link: https://lkml.kernel.org/r/20221030225825.40872-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220919021348.22151-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Michael Ellerman Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 50 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8b4f93e84868..4a76c0fc6bbf 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -209,17 +211,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift); -struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, - int flags); -struct page *follow_huge_pud(struct 
mm_struct *mm, unsigned long address, - pud_t *pud, int flags); -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, - pgd_t *pgd, int flags); void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); @@ -272,6 +263,12 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ +} + static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, @@ -282,12 +279,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm, return 0; } -static inline struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, @@ -320,31 +311,6 @@ static inline void hugetlb_show_meminfo_node(int nid) { } -static inline struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, - int pdshift) -{ - return NULL; -} - -static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, - unsigned long address, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pud(struct mm_struct *mm, - unsigned long address, pud_t *pud, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pgd(struct mm_struct *mm, - unsigned long address, pgd_t *pgd, int flags) -{ - return NULL; -} - static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { -- cgit v1.2.3 From 0538a82c39e94d49fa6985c6a0101ca819be11ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Oct 2022 15:31:13 -0400 Subject: mm: vmscan: make rotations a secondary factor in balancing anon vs file We noticed a 2% webserver throughput regression after upgrading from 5.6. This could be tracked down to a shift in the anon/file reclaim balance (confirmed with swappiness) that resulted in worse reclaim efficiency and thus more kswapd activity for the same outcome. The change that exposed the problem is aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU"). By qualifying swapins based on their refault distance, it lowered the cost of anon reclaim in this workload, in turn causing (much) more anon scanning than before. Scanning the anon list is more expensive due to the higher ratio of mmapped pages that may rotate during reclaim, and so the result was an increase in %sys time. Right now, rotations aren't considered a cost when balancing scan pressure between LRUs. We can end up with very few file refaults putting all the scan pressure on hot anon pages that are rotated en masse, don't get reclaimed, and never push back on the file LRU again. We still only reclaim file cache in that case, but we burn a lot CPU rotating anon pages. It's "fair" from an LRU age POV, but doesn't reflect the real cost it imposes on the system. Consider rotations as a secondary factor in balancing the LRUs. This doesn't attempt to make a precise comparison between IO cost and CPU cost, it just says: if reloads are about comparable between the lists, or rotations are overwhelmingly different, adjust for CPU work. This fixed the regression on our webservers. 
It has since been deployed to the entire Meta fleet and hasn't caused any problems. Link: https://lkml.kernel.org/r/20221013193113.726425-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++-- mm/swap.c | 22 +++++++++++++++++----- mm/vmscan.c | 4 +++- mm/workingset.c | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a18cf4b7c724..369d7799205d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -void lru_note_cost_folio(struct folio *); +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void lru_cache_add(struct page *); diff --git a/mm/swap.c b/mm/swap.c index 955930f41d20..2f12a2ee1d3a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -295,8 +295,20 @@ void folio_rotate_reclaimable(struct folio *folio) } } -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) { + unsigned long cost; + + /* + * Reflect the relative cost of incurring IO and spending CPU + * time on rotations. This doesn't attempt to make a precise + * comparison, it just says: if reloads are about comparable + * between the LRU lists, or rotations are overwhelmingly + * different between them, adjust scan balance for CPU work. + */ + cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + do { unsigned long lrusize; @@ -310,9 +322,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) - lruvec->file_cost += nr_pages; + lruvec->file_cost += cost; else - lruvec->anon_cost += nr_pages; + lruvec->anon_cost += cost; /* * Decay previous events @@ -335,10 +347,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_folio(struct folio *folio) +void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio)); + folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) diff --git a/mm/vmscan.c b/mm/vmscan.c index 04d8b88e5216..ffe402e095d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout); + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); mem_cgroup_uncharge_list(&folio_list); free_unref_page_list(&folio_list); @@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); + if (nr_rotated) + lru_note_cost(lruvec, file, 0, nr_rotated); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, diff --git a/mm/workingset.c b/mm/workingset.c index 
ae7e984b23c6..d2d02978588c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow) if (workingset) { folio_set_workingset(folio); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_folio(folio); + lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: -- cgit v1.2.3 From d03c376d9066532551dc56837c7c5490e4fcbbfe Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:03 -0500 Subject: mm/hugetlb: add folio support to hugetlb specific flag macros Patch series "begin converting hugetlb code to folios", v4. This patch series starts the conversion of the hugetlb code to operate on struct folios rather than struct pages. This removes the ambiguitiy of whether functions are operating on head pages, tail pages of compound pages, or base pages. This series passes the linux test project hugetlb test cases. Patch 1 adds hugeltb specific page macros that can operate on folios. Patch 2 adds the private field of the first tail page to struct page. For 32-bit, _private_1 alinging with page[1].private was confirmed by using pahole. Patch 3 introduces hugetlb subpool helper functions which operate on struct folios. These patches were tested using the hugepage-mmap.c selftest along with the migratepages command. Patch 4 converts hugetlb_delete_from_page_cache() to use folios. Patch 5 adds a folio_hstate() function to get hstate information from a folio and adds a user of folio_hstate(). Bpftrace was used to track time spent in the free_huge_pages function during the ltp test cases as it is a caller of the hugetlb subpool functions. From the histogram, the performance is similar before and after the patch series. Time spent in 'free_huge_page' 6.0.0-rc2.master.20220823 @nsecs: [256, 512) 14770 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 155 | | [1K, 2K) 169 | | [2K, 4K) 50 | | [4K, 8K) 14 | | [8K, 16K) 3 | | [16K, 32K) 3 | | 6.0.0-rc2.master.20220823 + patch series @nsecs: [256, 512) 13678 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 142 | | [1K, 2K) 199 | | [2K, 4K) 44 | | [4K, 8K) 13 | | [8K, 16K) 4 | | [16K, 32K) 1 | | This patch (of 5): Allow the macros which test, set, and clear hugetlb specific page flags to take a hugetlb folio as an input. The macrros are generated as folio_{test, set, clear}_hugetlb_{restore_reserve, migratable, temporary, freed, vmemmap_optimized, raw_hwp_unreliable}. Link: https://lkml.kernel.org/r/20220922154207.1575343-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20220922154207.1575343-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . 
Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4a76c0fc6bbf..3ff5d2dd3ca3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -589,26 +589,50 @@ enum hugetlb_page_flags { */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ +static __always_inline \ +bool folio_test_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + return test_bit(HPG_##flname, private); \ + } \ static inline int HPage##uname(struct page *page) \ { return test_bit(HPG_##flname, &(page->private)); } #define SETHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_set_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + set_bit(HPG_##flname, private); \ + } \ static inline void SetHPage##uname(struct page *page) \ { set_bit(HPG_##flname, &(page->private)); } #define CLEARHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_clear_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + clear_bit(HPG_##flname, private); \ + } \ static inline void ClearHPage##uname(struct page *page) \ { clear_bit(HPG_##flname, &(page->private)); } #else #define TESTHPAGEFLAG(uname, flname) \ +static inline bool \ +folio_test_hugetlb_##flname(struct folio *folio) \ + { return 0; } \ static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ +static inline void \ +folio_set_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ +static inline void \ +folio_clear_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void ClearHPage##uname(struct page *page) \ { } #endif -- cgit v1.2.3 From d340625f4849ab5dbfebbc7d84709fbfcd39e52f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:04 -0500 Subject: mm: add private field of first tail to struct page and struct folio Allow struct folio to store hugetlb metadata that is contained in the private field of the first tail page. On 32-bit, _private_1 aligns with page[1].private. Link: https://lkml.kernel.org/r/20220922154207.1575343-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca..2d5b1575ffe0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,7 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ + unsigned long _private_1; #endif }; struct { /* Second tail page of compound page */ @@ -264,6 +265,7 @@ struct page { * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). 
+ * @_private_1: Do not use directly, call folio_get_private_1(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -311,6 +313,7 @@ struct folio { #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + unsigned long _private_1; }; #define FOLIO_MATCH(pg, fl) \ @@ -338,6 +341,7 @@ FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); +FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH @@ -383,6 +387,16 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } +static inline void folio_set_private_1(struct folio *folio, unsigned long private) +{ + folio->_private_1 = private; +} + +static inline unsigned long folio_get_private_1(struct folio *folio) +{ + return folio->_private_1; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) -- cgit v1.2.3 From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 8 ++++---- include/linux/hugetlb.h | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..c5137607e523 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1091,10 +1091,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ff5d2dd3ca3..496d02bdb997 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -718,18 +718,29 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return (void *)folio_get_private_1(folio); +} + /* * hugetlb page subpool pointer located in hpage[1].private */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { - return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL); + return hugetlb_folio_subpool(page_folio(hpage)); +} + +static inline void hugetlb_set_folio_subpool(struct folio *folio, + struct hugepage_subpool *subpool) +{ + folio_set_private_1(folio, (unsigned long)subpool); } static inline void hugetlb_set_page_subpool(struct page *hpage, struct hugepage_subpool *subpool) { - set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool); + hugetlb_set_folio_subpool(page_folio(hpage), subpool); } static inline struct 
hstate *hstate_file(struct file *f) -- cgit v1.2.3 From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 12 ++++++------ include/linux/pagemap.h | 1 - mm/folio-compat.c | 5 ----- 3 files changed, 6 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5137607e523..00495fc128c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,11 +364,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void hugetlb_delete_from_page_cache(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); } /* @@ -574,8 +574,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..060ee98474ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); -void delete_from_page_cache(struct page *page); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d7..8ae39c06da62 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -124,11 +124,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -void delete_from_page_cache(struct page *page) -{ - return filemap_remove_folio(page_folio(page)); -} - int try_to_release_page(struct page *page, gfp_t gfp) { return filemap_release_folio(page_folio(page), gfp); -- cgit v1.2.3 From e51da3a9b6c2f67879880259a25c51dbda01c462 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:07 -0500 Subject: mm/hugetlb: add folio_hstate() Helper function to retrieve hstate information from a hugetlb folio. 
Link: https://lkml.kernel.org/r/20220922154207.1575343-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: kernel test robot Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 14 ++++++++++++-- mm/migrate.c | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 496d02bdb997..20a0d5a08395 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -824,10 +824,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +static inline struct hstate *folio_hstate(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + return size_to_hstate(folio_size(folio)); +} + static inline struct hstate *page_hstate(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(page_size(page)); + return folio_hstate(page_folio(page)); } static inline unsigned hstate_index_to_shift(unsigned index) @@ -1036,6 +1041,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return NULL; } +static inline struct hstate *folio_hstate(struct folio *folio) +{ + return NULL; +} + static inline struct hstate *page_hstate(struct page *page) { return NULL; diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..556cb1c86e53 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1620,7 +1620,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) nid = folio_nid(folio); if (folio_test_hugetlb(folio)) { - struct hstate *h = page_hstate(&folio->page); + struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); -- cgit v1.2.3 From 3e0ee843427a573e3e1187a5331e4b7fb00a76f3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 7 Oct 2022 13:37:41 +0200 Subject: mm: fix typo in struct vm_operations_struct comments There is no eprotect(), so I assume this is about mprotect(). Link: https://lkml.kernel.org/r/2385684.8vm7BOzihM@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..f6d2d2d9e284 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -549,7 +549,7 @@ struct vm_operations_struct { /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if eprotect() can proceed. + * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); -- cgit v1.2.3 From eafd296e0cc0cc03b4ae01c2b3b07273514d757c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:46 +0800 Subject: memory: remove unused register_hotmemory_notifier() Remove unused register_hotmemory_notifier(). 
Link: https://lkml.kernel.org/r/20220923033347.3935160-8-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- include/linux/memory.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index aa619464a1df..98d2a2ebcc10 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,7 +19,6 @@ #include #include #include -#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -136,9 +135,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -166,8 +162,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ }) -#define register_hotmemory_notifier(nb) register_memory_notifier(nb) -#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #ifdef CONFIG_NUMA void memory_block_add_nid(struct memory_block *mem, int nid, -- cgit v1.2.3 From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. 
Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 2 +- fs/proc/kcore.c | 2 +- include/linux/memory-tiers.h | 1 - include/linux/memory.h | 9 +++++++-- kernel/cgroup/cpuset.c | 2 +- mm/kasan/shadow.c | 2 +- mm/ksm.c | 2 +- mm/memory-tiers.c | 2 +- mm/mm_init.c | 2 +- mm/mmap.c | 2 +- mm/page_ext.c | 2 +- 11 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 0ecefb604734..139e3b41653e 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -849,7 +849,7 @@ static __init int hmat_init(void) hmat_register_targets(); /* Keep the table and structures if the notifier may use them */ - if (!hotplug_memory_notifier(hmat_callback, 2)) + if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI)) return 0; out_put: hmat_free_structures(); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7692a360972d..98f3289556e4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -689,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 965009aa01d7..fc9647b1b4f9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -18,7 +18,6 @@ * the same memory tier. */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) -#define MEMTIER_HOTPLUG_PRIO 100 struct memory_tier; struct memory_dev_type { diff --git a/include/linux/memory.h b/include/linux/memory.h index 98d2a2ebcc10..463662ef7614 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,8 +112,13 @@ struct mem_section; * Priorities for the hotplug memory callback routines (stored in decreasing * order in the callback chain) */ -#define SLAB_CALLBACK_PRI 1 -#define IPC_CALLBACK_PRI 10 +#define DEFAULT_CALLBACK_PRI 0 +#define SLAB_CALLBACK_PRI 1 +#define HMAT_CALLBACK_PRI 2 +#define MM_COMPUTE_BATCH_PRI 10 +#define CPUSET_CALLBACK_PRI 10 +#define MEMTIER_HOTPLUG_PRI 100 +#define KSM_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0c6db6a4f427..3ea2e836e93e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3647,7 +3647,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6..2fba1f51f042 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - hotplug_memory_notifier(kasan_mem_notifier, 0); + hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/mm/ksm.c b/mm/ksm.c index c19fcca9bc03..7ba97f86d831 100644 --- 
a/mm/ksm.c +++ b/mm/ksm.c @@ -3211,7 +3211,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index fa8c9d07f9ce..939e200c283b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -664,7 +664,7 @@ static int __init memory_tier_init(void) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO); + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/mm_init.c b/mm/mm_init.c index 44aadc162d1f..c1883362e71d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -181,7 +181,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - hotplug_memory_notifier(mm_compute_batch_notifier, IPC_CALLBACK_PRI); + hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 3f47fd57d165..c697771d406b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3751,7 +3751,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, static int __meminit init_reserve_notifier(void) { - if (hotplug_memory_notifier(reserve_mem_notifier, 0)) + if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; diff --git a/mm/page_ext.c b/mm/page_ext.c index affe80243b6d..b2ff5c9129f4 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -513,7 +513,7 @@ void __init page_ext_init(void) cond_resched(); } } - hotplug_memory_notifier(page_ext_callback, 0); + hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; -- cgit v1.2.3 From 3c0c9bc9c9596d5cd69529da822526f88673365b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:47 +0200 Subject: mm: vmalloc: add alloc_vmap_area trace event Patch series "Add basic trace events for vmap/vmalloc (v2)", v2. This small series add some basic trace events for the vmap/vmalloc code. Since currently we lack any, sometimes it is hard to start debuging vmap code if an issue is reported or occured. For example https://lore.kernel.org/linux-mm/Y0p8BZIiDXLQbde%2F@pc636/T/ The final patch adds two reviewers for vmalloc code. This patch (of 7): It is for debug purposes and for validation of passed parameters. 
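As a rough sketch only (the actual call site is wired up later in this series and is not part of the hunk below), the event generated by this header would be emitted from the vmap allocation path with the arguments declared in its TP_PROTO; the helper name here is an assumption, not code from the patch:

	/* Illustration only -- not part of this hunk.  Exactly one .c file
	 * (assumed here to be mm/vmalloc.c, as for other trace headers)
	 * instantiates the events before using them.
	 */
	#define CREATE_TRACE_POINTS
	#include <trace/events/vmalloc.h>

	/* Hypothetical helper showing how the allocation path could report
	 * one attempt; the argument names mirror TP_PROTO below.
	 */
	static void report_alloc_vmap_area(unsigned long addr, unsigned long size,
					   unsigned long align, unsigned long vstart,
					   unsigned long vend, int failed)
	{
		trace_alloc_vmap_area(addr, size, align, vstart, vend, failed);
	}

Once wired up, the event can be enabled like any other tracepoint, for example through the vmalloc/alloc_vmap_area entry under tracefs, giving a per-allocation record of the requested size, alignment and range plus whether the request failed.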
Link: https://lkml.kernel.org/r/20221018181053.434508-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20221018181053.434508-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/trace/events/vmalloc.h (limited to 'include') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h new file mode 100644 index 000000000000..39fbd77c91e7 --- /dev/null +++ b/include/trace/events/vmalloc.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vmalloc + +#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMALLOC_H + +#include + +/** + * alloc_vmap_area - called when a new vmap allocation occurs + * @addr: an allocated address + * @size: a requested size + * @align: a requested alignment + * @vstart: a requested start range + * @vend: a requested end range + * @failed: an allocation failed or not + * + * This event is used for a debug purpose, it can give an extra + * information for a developer about how often it occurs and which + * parameters are passed for further validation. + */ +TRACE_EVENT(alloc_vmap_area, + + TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int failed), + + TP_ARGS(addr, size, align, vstart, vend, failed), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, size) + __field(unsigned long, align) + __field(unsigned long, vstart) + __field(unsigned long, vend) + __field(int, failed) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->size = size; + __entry->align = align; + __entry->vstart = vstart; + __entry->vend = vend; + __entry->failed = failed; + ), + + TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", + __entry->addr, __entry->size, __entry->align, + __entry->vstart, __entry->vend, __entry->failed) +); + +#endif /* _TRACE_VMALLOC_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From b3a5a7b099162e1b11db459f8128d4374f7d1c05 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:48 +0200 Subject: mm: vmalloc: add purge_vmap_area_lazy trace event It is for debug purposes to track number of freed vmap areas including a range it occurs on. Link: https://lkml.kernel.org/r/20221018181053.434508-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index 39fbd77c91e7..afeb8003a0f2 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -50,6 +50,39 @@ TRACE_EVENT(alloc_vmap_area, __entry->vstart, __entry->vend, __entry->failed) ); +/** + * purge_vmap_area_lazy - called when vmap areas were lazily freed + * @start: purging start address + * @end: purging end address + * @npurged: numbed of purged vmap areas + * + * This event is used for a debug purpose. 
It gives some + * indication about start:end range and how many objects + * are released. + */ +TRACE_EVENT(purge_vmap_area_lazy, + + TP_PROTO(unsigned long start, unsigned long end, + unsigned int npurged), + + TP_ARGS(start, end, npurged), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, npurged) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->npurged = npurged; + ), + + TP_printk("start=0x%lx end=0x%lx num_purged=%u", + __entry->start, __entry->end, __entry->npurged) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ -- cgit v1.2.3 From fabc27f7649e070c4f6c742e436a51ff68c4a280 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:49 +0200 Subject: mm: vmalloc: add free_vmap_area_noflush trace event This event is used in order to validate/debug a start address of freed VA, number of currently outstanding and maximum allowed areas. Link: https://lkml.kernel.org/r/20221018181053.434508-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index afeb8003a0f2..ad4e02191f35 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -83,6 +83,40 @@ TRACE_EVENT(purge_vmap_area_lazy, __entry->start, __entry->end, __entry->npurged) ); +/** + * free_vmap_area_noflush - called when a vmap area is freed + * @va_start: a start address of VA + * @nr_lazy: number of current lazy pages + * @nr_lazy_max: number of maximum lazy pages + * + * This event is used for a debug purpose. It gives some + * indication about a VA that is released, number of current + * outstanding areas and a maximum allowed threshold before + * dropping all of them. + */ +TRACE_EVENT(free_vmap_area_noflush, + + TP_PROTO(unsigned long va_start, unsigned long nr_lazy, + unsigned long nr_lazy_max), + + TP_ARGS(va_start, nr_lazy, nr_lazy_max), + + TP_STRUCT__entry( + __field(unsigned long, va_start) + __field(unsigned long, nr_lazy) + __field(unsigned long, nr_lazy_max) + ), + + TP_fast_assign( + __entry->va_start = va_start; + __entry->nr_lazy = nr_lazy; + __entry->nr_lazy_max = nr_lazy_max; + ), + + TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", + __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c5255b421fd04ba6a405809b7216a3b6ebd5493a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:32 +0100 Subject: mm: remove FGP_HEAD This is no longer used; all callers have been converted to use folios instead. Somehow this manages to save 11 bytes of text. 
Link: https://lkml.kernel.org/r/20221019183332.2802139-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ++--- mm/folio-compat.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 060ee98474ef..b33ab86d5dca 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 -#define FGP_HEAD 0x00000080 -#define FGP_ENTRY 0x00000100 -#define FGP_STABLE 0x00000200 +#define FGP_ENTRY 0x00000080 +#define FGP_STABLE 0x00000100 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 8ae39c06da62..bac2a366aada 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -108,7 +108,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + if (!folio || xa_is_value(folio)) return &folio->page; return folio_file_page(folio, index); } -- cgit v1.2.3 From 6e2be1f2ebcea42ed6044432f72f32434e60b34d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:52 +0200 Subject: compiler-gcc: be consistent with underscores use for `no_sanitize` Patch series "compiler-gcc: be consistent with underscores use for `no_sanitize`". This patch (of 5): Other macros that define shorthands for attributes in e.g. `compiler_attributes.h` and elsewhere use underscores. 
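One practical effect of the reserved spelling, not called out in the commit message and added here purely as an illustration, is that the __name__ form cannot be rewritten by an unrelated object-like macro of the same name; the colliding macro below is made up:

	/* Hypothetical macro defined somewhere else in the build: */
	#define no_sanitize_address	broken

	/* The plain spelling is rewritten by the preprocessor to
	 * __attribute__((broken)) and no longer means what was intended: */
	void f(void) __attribute__((no_sanitize_address));

	/* The reserved spelling used by this series is unaffected: */
	void g(void) __attribute__((__no_sanitize_address__));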
Link: https://lkml.kernel.org/r/20221021115956.9947-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f55a37efdb97..b9530d3515ac 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -83,25 +83,25 @@ #endif #if __has_attribute(__no_sanitize_address__) -#define __no_sanitize_address __attribute__((no_sanitize_address)) +#define __no_sanitize_address __attribute__((__no_sanitize_address__)) #else #define __no_sanitize_address #endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread __attribute__((no_sanitize_thread)) +#define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread #endif #if __has_attribute(__no_sanitize_undefined__) -#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) +#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) #else #define __no_sanitize_undefined #endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) -#define __no_sanitize_coverage __attribute__((no_sanitize_coverage)) +#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else #define __no_sanitize_coverage #endif -- cgit v1.2.3 From ae37a9a2c2d0960d643d782b426ea1aa9c05727a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:53 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_address__` The attribute was added in GCC 4.8, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/84v56vcn8 Link: https://lkml.kernel.org/r/20221021115956.9947-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index b9530d3515ac..bfce7f4d0978 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -82,11 +82,7 @@ #define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_attribute(__no_sanitize_address__) #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#else -#define __no_sanitize_address -#endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) -- cgit v1.2.3 From 095ac0763ac507dd4e1a71ad9784f49f51498483 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:54 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_thread__` The attribute was added in GCC 5.1, which matches the minimum GCC version supported by the kernel. Therefore, remove the check. 
Link: https://godbolt.org/z/vbxKejxbx Link: https://lkml.kernel.org/r/20221021115956.9947-3-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bfce7f4d0978..ba207deb77ca 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -84,7 +84,7 @@ #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) +#if defined(__SANITIZE_THREAD__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread -- cgit v1.2.3 From 689540cbda7f69594ae5e13fef4c8239519d8b66 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:55 +0200 Subject: compiler-gcc: remove attribute support check for `__no_sanitize_undefined__` The attribute was added in GCC 4.9, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/GrMeo6fYr Link: https://lkml.kernel.org/r/20221021115956.9947-4-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ba207deb77ca..7f2c2bb73815 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,11 +90,7 @@ #define __no_sanitize_thread #endif -#if __has_attribute(__no_sanitize_undefined__) #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) -#else -#define __no_sanitize_undefined -#endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) -- cgit v1.2.3 From f39556bc2530c83a22bc11b73c7a46df9a340685 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:56 +0200 Subject: compiler-gcc: document minimum version for `__no_sanitize_coverage__` The attribute was added in GCC 12.1. This will simplify future cleanups, and is closer to what we do in `compiler_attributes.h`. 
Link: https://godbolt.org/z/MGbT76j6G Link: https://lkml.kernel.org/r/20221021115956.9947-5-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7f2c2bb73815..7af9e34ec261 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -92,6 +92,9 @@ #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) +/* + * Only supported since gcc >= 12 + */ #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else -- cgit v1.2.3 From e591ef7d96d6ea249916f351dc26a636e565c635 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:09 +0900 Subject: mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage Patch series "mm, hwpoison: improve handling workload related to hugetlb and memory_hotplug", v7. This patchset tries to solve the issue among memory_hotplug, hugetlb and hwpoison. In this patchset, memory hotplug handles hwpoison pages like below: - hwpoison pages should not prevent memory hotremove, - memory block with hwpoison pages should not be onlined. This patch (of 4): HWPoisoned page is not supposed to be accessed once marked, but currently such accesses can happen during memory hotremove because do_migrate_range() can be called before dissolve_free_huge_pages() is called. Clear HPageMigratable for hwpoisoned hugepages to prevent them from being migrated. This should be done in hugetlb_lock to avoid race against isolate_hugetlb(). get_hwpoison_huge_page() needs to have a flag to show it's called from unpoison to take refcount of hwpoisoned hugepages, so add it. 
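Condensed from the hunks below purely as a reading aid (the refcount and error handling of the real code are omitted), the flag is cleared while hugetlb_lock is held, the same lock isolate_hugetlb() runs under, so the two cannot race:

	/* Sketch of the locking relationship described above. */
	spin_lock_irq(&hugetlb_lock);
	if (HPageMigratable(head)) {
		ClearHPageMigratable(head);
		*migratable_cleared = true;	/* remembered so hwpoison_filter() can undo it */
	}
	spin_unlock_irq(&hugetlb_lock);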
[naoya.horiguchi@linux.dev: remove TestClearHPageMigratable and reduce to test and clear separately] Link: https://lkml.kernel.org/r/20221025053559.GA2104800@ik1-406-35019.vs.sakura.ne.jp Link: https://lkml.kernel.org/r/20221024062012.1520887-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20221024062012.1520887-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++---- include/linux/mm.h | 6 ++++-- mm/hugetlb.c | 9 +++++---- mm/memory-failure.c | 21 +++++++++++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20a0d5a08395..65ea34022aa2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -183,8 +183,9 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -391,12 +392,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index f6d2d2d9e284..e2ac6fff03a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3277,9 +3277,11 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); #else -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fc8908d715d6..fdb36afea2b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7265,7 +7265,7 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7275,7 +7275,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; @@ -7284,12 +7284,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } 
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags); + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 13594556146c..4fff0b36c61d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1244,7 +1244,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, false); if (hugetlb) return ret; @@ -1334,7 +1334,7 @@ static int __get_unpoison_page(struct page *page) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, true); if (hugetlb) return ret; @@ -1785,7 +1785,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage) * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct page *head = compound_head(page); @@ -1815,6 +1816,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } + /* + * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * from being migrated by memory hotremove. + */ + if (count_increased && HPageMigratable(head)) { + ClearHPageMigratable(head); + *migratable_cleared = true; + } + return ret; out: if (count_increased) @@ -1834,10 +1844,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; + bool migratable_cleared = false; *hugetlb = 1; retry: - res = get_huge_page_for_hwpoison(pfn, flags); + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); if (res == 2) { /* fallback to normal page handling */ *hugetlb = 0; return 0; @@ -1861,6 +1872,8 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); + if (migratable_cleared) + SetHPageMigratable(head); unlock_page(head); if (res == 1) put_page(head); -- cgit v1.2.3 From d027122d8363e58cd8bc2fa6a16917f7f69b85bb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:10 +0900 Subject: mm/hwpoison: move definitions of num_poisoned_pages_* to memory-failure.c These interfaces will be used by drivers/base/memory.c by later patch, so as a preparatory work move them to more common header file visible to the file. 
Link: https://lkml.kernel.org/r/20221024062012.1520887-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 3 +-- include/linux/mm.h | 5 +++++ include/linux/swapops.h | 24 ++---------------------- mm/memory-failure.c | 10 ++++++++++ 4 files changed, 18 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index e391b175f5ec..fdc880e2575a 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index e2ac6fff03a8..c667a6c4b657 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,12 +3279,17 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); +extern void num_poisoned_pages_inc(void); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } + +static inline void num_poisoned_pages_inc(void) +{ +} #endif #ifndef arch_memory_failure diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95ccb81bb..3ba9bf56899d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) #ifdef CONFIG_MEMORY_FAILURE -extern atomic_long_t num_poisoned_pages __read_mostly; - /* * Support for hardware poisoned pages */ @@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline void num_poisoned_pages_inc(void) -{ - atomic_long_inc(&num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long i) -{ - atomic_long_sub(i, &num_poisoned_pages); -} - -#else /* CONFIG_MEMORY_FAILURE */ +#else static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } - -static inline void num_poisoned_pages_inc(void) -{ -} - -static inline void num_poisoned_pages_sub(long i) -{ -} -#endif /* CONFIG_MEMORY_FAILURE */ +#endif static inline int non_swap_entry(swp_entry_t entry) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4fff0b36c61d..44d7bf6ff214 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,6 +74,16 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +inline void num_poisoned_pages_inc(void) +{ + atomic_long_inc(&num_poisoned_pages); +} + +static inline void num_poisoned_pages_sub(long i) +{ + atomic_long_sub(i, &num_poisoned_pages); +} + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, -- cgit v1.2.3 From a46c9304b4bbf1b164154976cbb7e648980c7b5b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:11 +0900 Subject: mm/hwpoison: pass pfn to num_poisoned_pages_*() No functional change. 
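The new pfn argument is unused by this patch itself (hence "no functional change"); it is plumbed through so that a follow-up patch can route the update to the memory_block that owns the pfn. A rough sketch of where the interface is headed, matching the later diff in this series (the -1UL convention for "no single pfn" also arrives with that follow-up):

    void num_poisoned_pages_inc(unsigned long pfn)
    {
            atomic_long_inc(&num_poisoned_pages);
            /* added by the follow-up patch: per-memory_block accounting */
            memblk_nr_poison_inc(pfn);
    }

    void num_poisoned_pages_sub(unsigned long pfn, long i)
    {
            atomic_long_sub(i, &num_poisoned_pages);
            if (pfn != -1UL)        /* -1UL: caller has no single pfn */
                    memblk_nr_poison_sub(pfn, i);
    }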
Link: https://lkml.kernel.org/r/20221024062012.1520887-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 2 +- include/linux/mm.h | 4 ++-- mm/memory-failure.c | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index fdc880e2575a..80943a00e245 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -231,7 +231,7 @@ void __init pdc_pdt_init(void) /* mark memory page bad */ memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(addr >> PAGE_SHIFT); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index c667a6c4b657..78ae2ee09a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,7 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(void); +extern void num_poisoned_pages_inc(unsigned long pfn); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3287,7 +3287,7 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void num_poisoned_pages_inc(void) +static inline void num_poisoned_pages_inc(unsigned long pfn) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44d7bf6ff214..757a46e172de 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,12 +74,12 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -inline void num_poisoned_pages_inc(void) +inline void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); } -static inline void num_poisoned_pages_sub(long i) +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); } @@ -125,7 +125,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (release) put_page(page); page_ref_inc(page); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); return true; } @@ -1194,7 +1194,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(pfn); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1741,7 +1741,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) llist_add(&raw_hwp->node, head); /* the first error event will be counted in action_result(). */ if (ret) - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. 
We no longer trace all @@ -2414,7 +2414,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_sub(count); + num_poisoned_pages_sub(pfn, count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2630,5 +2630,5 @@ void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } } if (total) - num_poisoned_pages_sub(total); + num_poisoned_pages_sub(0, total); } -- cgit v1.2.3 From 5033091de814ab4b5623faed2755f3064e19e2d2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:12 +0900 Subject: mm/hwpoison: introduce per-memory_block hwpoison counter Currently PageHWPoison flag does not behave well when experiencing memory hotremove/hotplug. Any data field in struct page is unreliable when the associated memory is offlined, and the current mechanism can't tell whether a memory block is onlined because a new memory devices is installed or because previous failed offline operations are undone. Especially if there's a hwpoisoned memory, it's unclear what the best option is. So introduce a new mechanism to make struct memory_block remember that a memory block has hwpoisoned memory inside it. And make any online event fail if the onlining memory block contains hwpoison. struct memory_block is freed and reallocated over ACPI-based hotremove/hotplug, but not over sysfs-based hotremove/hotplug. So the new counter can distinguish these cases. Link: https://lkml.kernel.org/r/20221024062012.1520887-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/memory.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/memory.h | 3 +++ include/linux/mm.h | 20 +++++++++++++++++++- mm/internal.h | 8 -------- mm/memory-failure.c | 36 +++++++++++------------------------- mm/sparse.c | 2 -- 6 files changed, 71 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 9aa0da991cfb..fe98fb8d94e5 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +static unsigned long memblk_nr_poison(struct memory_block *mem); +#else +static inline unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return 0; +} +#endif + static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; + if (memblk_nr_poison(mem)) + return -EHWPOISON; + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, start_pfn, nr_pages); @@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; + num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem)); unregister_memory_block_under_nodes(mem); remove_memory_block(mem); } @@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, } return ret; } + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +void memblk_nr_poison_inc(unsigned long pfn) +{ + const 
unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_inc(&mem->nr_hwpoison); +} + +void memblk_nr_poison_sub(unsigned long pfn, long i) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_sub(i, &mem->nr_hwpoison); +} + +static unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return atomic_long_read(&mem->nr_hwpoison); +} +#endif diff --git a/include/linux/memory.h b/include/linux/memory.h index 463662ef7614..31343566c221 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -84,6 +84,9 @@ struct memory_block { unsigned long nr_vmemmap_pages; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) + atomic_long_t nr_hwpoison; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); diff --git a/include/linux/mm.h b/include/linux/mm.h index 78ae2ee09a24..429ff89bfe06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,8 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3290,6 +3291,23 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, static inline void num_poisoned_pages_inc(unsigned long pfn) { } + +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) +{ +} + +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) +{ +} #endif #ifndef arch_memory_failure diff --git a/mm/internal.h b/mm/internal.h index 68afdbe7106e..bcf75a8b032d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -708,14 +708,6 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -#ifdef CONFIG_MEMORY_FAILURE -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757a46e172de..9b82402ec242 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -77,11 +77,14 @@ static bool hw_memory_failure __read_mostly = false; inline void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); + memblk_nr_poison_inc(pfn); } -static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); + if (pfn != -1UL) + memblk_nr_poison_sub(pfn, i); } /* @@ -1706,6 +1709,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool 
move_flag) if (move_flag) SetPageHWPoison(p->page); + else + num_poisoned_pages_sub(page_to_pfn(p->page), 1); kfree(p); count++; } @@ -2332,6 +2337,7 @@ int unpoison_memory(unsigned long pfn) int ret = -EBUSY; int freeit = 0; unsigned long count = 1; + bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2380,6 +2386,7 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2395,6 +2402,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } else { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2414,7 +2422,8 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_sub(pfn, count); + if (!huge) + num_poisoned_pages_sub(pfn, 1); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2609,26 +2618,3 @@ retry: return ret; } - -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ - int i, total = 0; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. - */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - total++; - ClearPageHWPoison(&memmap[i]); - } - } - if (total) - num_poisoned_pages_sub(0, total); -} diff --git a/mm/sparse.c b/mm/sparse.c index e5a8a3a0edd7..2779b419ef2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, - nr_pages - map_offset); section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From ea0ffd0c08d0fef1f6e93eb07badbeeabf6b43d6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 24 Oct 2022 00:25:33 +0800 Subject: swap: add a limit for readahead page-cluster value Currenty there is no upper limit for /proc/sys/vm/page-cluster, and it's a bit shift value, so it could result in overflow of the 32-bit integer. Add a reasonable upper limit for it, read-in at most 2**31 pages, which is a large enough value for readahead. 
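Since page-cluster is consumed as a shift count (the readahead window is 1 << page_cluster pages), an unbounded value can overflow the 32-bit integer it is shifted into. The fix clamps writes at the sysctl layer; below is a sketch of the resulting table entry, reconstructed around the one-line change in the diff that follows (the .extra2 hookup is the new part, the other fields are the long-standing vm_table entry; the standalone variable name is only illustrative):

    const int page_cluster_max = 31;        /* read in at most 2^31 pages */

    static struct ctl_table page_cluster_entry = {
            .procname       = "page-cluster",
            .data           = &page_cluster,
            .maxlen         = sizeof(int),
            .mode           = 0644,
            .proc_handler   = proc_dointvec_minmax,
            .extra1         = SYSCTL_ZERO,                  /* reject negative shifts */
            .extra2         = (void *)&page_cluster_max,    /* new: reject values > 31 */
    };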
Link: https://lkml.kernel.org/r/20221023162533.81561-1-ryncsn@gmail.com Signed-off-by: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + kernel/sysctl.c | 1 + mm/swap.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 429ff89bfe06..255931ebf2dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count) extern void * high_memory; extern int page_cluster; +extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 188c305aeb8b..71a4350ac601 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2125,6 +2125,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, }, { .procname = "dirtytime_expire_seconds", diff --git a/mm/swap.c b/mm/swap.c index 2f12a2ee1d3a..b9a6817e07ff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -43,8 +43,9 @@ #define CREATE_TRACE_POINTS #include -/* How many pages do we try to swap or page in/out together? */ +/* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; +const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { -- cgit v1.2.3 From f1a7941243c102a44e8847e3b94ff4ff3ec56f25 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Oct 2022 05:28:41 +0000 Subject: mm: convert mm's rss stats into percpu_counter Currently mm_struct maintains rss_stats which are updated on page fault and the unmapping codepaths. For page fault codepath the updates are cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. The reason for caching is performance for multithreaded applications otherwise the rss_stats updates may become hotspot for such applications. However this optimization comes with the cost of error margin in the rss stats. The rss_stats for applications with large number of threads can be very skewed. At worst the error margin is (nr_threads * 64) and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing the unbounded errors for rss_stats for specific applications which use TCP rx0cp. It seems like vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However this conversion enable us to get the accurate stats for situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read) for the readers. At the moment the readers are either procfs interface, oom_killer and memory reclaim which I think are not performance critical and should be ok with slow read. However I think we can make that change in a separate patch. 
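A minimal sketch of the trade-off being made (the two inline helpers mirror the diff below; the *_exact() wrapper is purely illustrative): updates become cheap per-CPU increments with bounded drift, and the few readers that need precision pay for a full fold instead of every page fault paying for synchronization.

    /* hot path: O(1), touches only this CPU's slot */
    static inline void inc_mm_counter(struct mm_struct *mm, int member)
    {
            percpu_counter_inc(&mm->rss_stat[member]);
    }

    /* common reader: cheap, may lag by roughly nr_cpus * batch pages */
    static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
    {
            return percpu_counter_read_positive(&mm->rss_stat[member]);
    }

    /* illustrative accurate reader for slow paths such as check_mm() */
    static long get_mm_counter_exact(struct mm_struct *mm, int member)
    {
            return percpu_counter_sum(&mm->rss_stat[member]);
    }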
Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 +++++---------- include/linux/mm_types.h | 7 ++-- include/linux/mm_types_task.h | 13 -------- include/linux/percpu_counter.h | 1 - include/linux/sched.h | 3 -- include/trace/events/kmem.h | 8 ++--- kernel/fork.c | 16 ++++++++- mm/memory.c | 73 +++++++----------------------------------- 8 files changed, 40 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f919befc8fac..0cb4e196d60b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2d5b1575ffe0..e86861ff5bbd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -626,11 +627,7 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. 
- */ - struct mm_rss_stat rss_stat; + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0bb4b6da9993..5414b5c6a103 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,19 +36,6 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) -#define SPLIT_RSS_COUNTING -/* per-thread cached information, */ -struct task_rss_stat { - int events; /* for synchronization threshold */ - int count[NR_MM_COUNTERS]; -}; -#endif /* USE_SPLIT_PTE_PTLOCKS */ - -struct mm_rss_stat { - atomic_long_t count[NR_MM_COUNTERS]; -}; - struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8ed5fba6d156..bde6c4c1f405 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -13,7 +13,6 @@ #include #include #include -#include /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..079d299fa465 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,9 +870,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 243073cfc29d..58688768ef0f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -346,10 +346,9 @@ TRACE_MM_PAGES TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, - int member, - long count), + int member), - TP_ARGS(mm, member, count), + TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) @@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat, __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; - __entry->size = (count << PAGE_SHIFT); + __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) + << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d..0fef202434c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -753,7 +753,7 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); + long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", @@ -779,6 +779,8 @@ static void check_mm(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { + int i; + BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); @@ -788,6 +790,9 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + + for (i = 0; i < NR_MM_COUNTERS; i++) + percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1107,6 +1112,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { + int i; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1148,10 +1155,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, 
if (init_new_context(p, mm)) goto fail_nocontext; + for (i = 0; i < NR_MM_COUNTERS; i++) + if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) + goto fail_pcpu; + mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; +fail_pcpu: + while (i > 0) + percpu_counter_destroy(&mm->rss_stat[--i]); fail_nocontext: mm_free_pgd(mm); fail_nopgd: diff --git a/mm/memory.c b/mm/memory.c index 7826143ec9cd..e0555ddd71b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,58 +162,11 @@ static int __init init_zero_pfn(void) } early_initcall(init_zero_pfn); -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member) { - trace_rss_stat(mm, member, count); + trace_rss_stat(mm, member); } -#if defined(SPLIT_RSS_COUNTING) - -void sync_mm_rss(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - if (current->rss_stat.count[i]) { - add_mm_counter(mm, i, current->rss_stat.count[i]); - current->rss_stat.count[i] = 0; - } - } - current->rss_stat.events = 0; -} - -static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) -{ - struct task_struct *task = current; - - if (likely(task->mm == mm)) - task->rss_stat.count[member] += val; - else - add_mm_counter(mm, member, val); -} -#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) -#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) - -/* sync counter once per 64 page faults */ -#define TASK_RSS_EVENTS_THRESH (64) -static void check_sync_rss_stat(struct task_struct *task) -{ - if (unlikely(task != current)) - return; - if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - sync_mm_rss(task->mm); -} -#else /* SPLIT_RSS_COUNTING */ - -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) - -static void check_sync_rss_stat(struct task_struct *task) -{ -} - -#endif /* SPLIT_RSS_COUNTING */ - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. 
*/ get_page(page); - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter(mm, mm_counter_file(old_page)); + inc_mm_counter(mm, MM_ANONPAGES); } } else { - inc_mm_counter_fast(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); /* @@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); @@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) -- cgit v1.2.3 From f689054aace2ff13af2e9a44a74fbba650ca31ba Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 9 Nov 2022 01:20:11 +0000 Subject: percpu_counter: add percpu_counter_sum_all interface The percpu_counter is used for scenarios where performance is more important than the accuracy. For percpu_counter users, who want more accurate information in their slowpath, percpu_counter_sum is provided which traverses all the online CPUs to accumulate the data. The reason it only needs to traverse online CPUs is because percpu_counter does implement CPU offline callback which syncs the local data of the offlined CPU. However there is a small race window between the online CPUs traversal of percpu_counter_sum and the CPU offline callback. The offline callback has to traverse all the percpu_counters on the system to flush the CPU local data which can be a lot. During that time, the CPU which is going offline has already been published as offline to all the readers. 
So, as the offline callback is running, percpu_counter_sum can be called for one counter which has some state on the CPU going offline. Since percpu_counter_sum only traverses online CPUs, it will skip that specific CPU and the offline callback might not have flushed the state for that specific percpu_counter on that offlined CPU. Normally this is not an issue because percpu_counter users can deal with some inaccuracy for small time window. However a new user i.e. mm_struct on the cleanup path wants to check the exact state of the percpu_counter through check_mm(). For such users, this patch introduces percpu_counter_sum_all() which traverses all possible CPUs and it is used in fork.c:check_mm() to avoid the potential race. This issue is exposed by the later patch "mm: convert mm's rss stats into percpu_counter". Link: https://lkml.kernel.org/r/20221109012011.881058-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 6 ++++++ kernel/fork.c | 5 +++++ lib/percpu_counter.c | 29 +++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index bde6c4c1f405..a3aae8d57a42 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -45,6 +45,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 percpu_counter_sum_all(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); @@ -193,6 +194,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } +static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return percpu_counter_read(fbc); +} + static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; diff --git a/kernel/fork.c b/kernel/fork.c index 0fef202434c3..1be4c4ab7f3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -755,6 +755,11 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); + if (likely(!x)) + continue; + + /* Making sure this is not due to race with CPU offlining. */ + x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index ed610b75dc32..42f729c8e56c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -117,11 +117,8 @@ void percpu_counter_sync(struct percpu_counter *fbc) } EXPORT_SYMBOL(percpu_counter_sync); -/* - * Add up all the per-cpu counts, return the result. 
This is a more accurate - * but much slower version of percpu_counter_read_positive() - */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, + const struct cpumask *cpu_mask) { s64 ret; int cpu; @@ -129,15 +126,35 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, cpu_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } + +/* + * Add up all the per-cpu counts, return the result. This is a more accurate + * but much slower version of percpu_counter_read_positive() + */ +s64 __percpu_counter_sum(struct percpu_counter *fbc) +{ + return __percpu_counter_sum_mask(fbc, cpu_online_mask); +} EXPORT_SYMBOL(__percpu_counter_sum); +/* + * This is slower version of percpu_counter_sum as it traverses all possible + * cpus. Use this only in the cases where accurate data is needed in the + * presense of CPUs getting offlined. + */ +s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return __percpu_counter_sum_mask(fbc, cpu_possible_mask); +} +EXPORT_SYMBOL(percpu_counter_sum_all); + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { -- cgit v1.2.3 From a873dfe1032a132bf89f9e19a6ac44f5a0b78754 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:19 -0700 Subject: mm, hwpoison: try to recover from copy-on write faults Patch series "Copy-on-write poison recovery", v3. Part 1 deals with the process that triggered the copy on write fault with a store to a shared read-only page. That process is send a SIGBUS with the usual machine check decoration to specify the virtual address of the lost page, together with the scope. Part 2 sets up to asynchronously take the page with the uncorrected error offline to prevent additional machine check faults. H/t to Miaohe Lin and Shuai Xue for pointing me to the existing function to queue a call to memory_failure(). On x86 there is some duplicate reporting (because the error is also signalled by the memory controller as well as by the core that triggered the machine check). Console logs look like this: This patch (of 2): If the kernel is copying a page as the result of a copy-on-write fault and runs into an uncorrectable error, Linux will crash because it does not have recovery code for this case where poison is consumed by the kernel. It is easy to set up a test case. Just inject an error into a private page, fork(2), and have the child process write to the page. I wrapped that neatly into a test at: git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git just enable ACPI error injection and run: # ./einj_mem-uc -f copy-on-write Add a new copy_user_highpage_mc() function that uses copy_mc_to_kernel() on architectures where that is available (currently x86 and powerpc). When an error is detected during the page copy, return VM_FAULT_HWPOISON to caller of wp_page_copy(). This propagates up the call stack. Both x86 and powerpc have code in their fault handler to deal with this code by sending a SIGBUS to the application. Note that this patch avoids a system crash and signals the process that triggered the copy-on-write action. It does not take any action for the memory error that is still in the shared page. To handle that a call to memory_failure() is needed. 
But this cannot be done from wp_page_copy() because it holds mmap_lock(). Perhaps the architecture fault handlers can deal with this loose end in a subsequent patch? On Intel/x86 this loose end will often be handled automatically because the memory controller provides an additional notification of the h/w poison in memory, the handler for this will call memory_failure(). This isn't a 100% solution. If there are multiple errors, not all may be logged in this way. [tony.luck@intel.com: add call to kmsan_unpoison_memory(), per Miaohe Lin] Link: https://lkml.kernel.org/r/20221031201029.102123-2-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-1-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-2-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Dan Williams Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Alexander Potapenko Tested-by: Shuai Xue Cc: Christophe Leroy Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/highmem.h | 26 ++++++++++++++++++++++++++ mm/memory.c | 30 ++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e9912da5441b..44242268f53b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -319,6 +319,32 @@ static inline void copy_user_highpage(struct page *to, struct page *from, #endif +#ifdef copy_mc_to_kernel +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + unsigned long ret; + char *vfrom, *vto; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (!ret) + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} +#else +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_user_highpage(to, from, vaddr, vma); + return 0; +} +#endif + #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) diff --git a/mm/memory.c b/mm/memory.c index e0555ddd71b5..13b1fe661d86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2798,10 +2798,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf) return same; } -static inline bool __wp_page_copy_user(struct page *dst, struct page *src, - struct vm_fault *vmf) +/* + * Return: + * 0: copied succeeded + * -EHWPOISON: copy failed due to hwpoison in source page + * -EAGAIN: copied failed (some other reason) + */ +static inline int __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) { - bool ret; + int ret; void *kaddr; void __user *uaddr; bool locked = false; @@ -2810,8 +2816,9 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); - return true; + if (copy_mc_user_highpage(dst, src, addr, vma)) + return -EHWPOISON; + return 0; } /* @@ -2838,7 +2845,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, * and update local tlb only */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2863,7 +2870,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The 
PTE changed under us, update local tlb */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2882,7 +2889,7 @@ warn: } } - ret = true; + ret = 0; pte_unlock: if (locked) @@ -3054,6 +3061,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; + int ret; delayacct_wpcopy_start(); @@ -3071,19 +3079,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; - if (!__wp_page_copy_user(new_page, old_page, vmf)) { + ret = __wp_page_copy_user(new_page, old_page, vmf); + if (ret) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. + * The -EHWPOISON case will not be retried. */ put_page(new_page); if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return 0; + return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } -- cgit v1.2.3 From d302c2398ba269e788a4f37ae57c07a7fcabaa42 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:20 -0700 Subject: mm, hwpoison: when copy-on-write hits poison, take page offline Cannot call memory_failure() directly from the fault handler because mmap_lock (and others) are held. It is important, but not urgent, to mark the source page as h/w poisoned and unmap it from other tasks. Use memory_failure_queue() to request a call to memory_failure() for the page with the error. Also provide a stub version for CONFIG_MEMORY_FAILURE=n Link: https://lkml.kernel.org/r/20221021200120.175753-3-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Miaohe Lin Cc: Christophe Leroy Cc: Dan Williams Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Naoya Horiguchi Cc: Nicholas Piggin Cc: Shuai Xue Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- mm/memory.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cb4e196d60b..3950ef45b9a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3280,7 +3280,6 @@ enum mf_flags { int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); -extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; @@ -3289,11 +3288,16 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE +extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} + static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { diff --git a/mm/memory.c b/mm/memory.c index 13b1fe661d86..659620b6770f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2816,8 +2816,10 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - if (copy_mc_user_highpage(dst, src, addr, vma)) + if 
(copy_mc_user_highpage(dst, src, addr, vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; + } return 0; } -- cgit v1.2.3 From 57e9cc50f4dd926d6c38751799d25cad89fb2bd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 26 Oct 2022 14:01:33 -0400 Subject: mm: vmscan: split khugepaged stats from direct reclaim stats Direct reclaim stats are useful for identifying a potential source for application latency, as well as spotting issues with kswapd. However, khugepaged currently distorts the picture: as a kernel thread it doesn't impose allocation latencies on userspace, and it explicitly opts out of kswapd reclaim. Its activity showing up in the direct reclaim stats is misleading. Counting it as kswapd reclaim could also cause confusion when trying to understand actual kswapd behavior. Break out khugepaged from the direct reclaim counters into new pgsteal_khugepaged, pgdemote_khugepaged, pgscan_khugepaged counters. Test with a huge executable (CONFIG_READ_ONLY_THP_FOR_FS): pgsteal_kswapd 1342185 pgsteal_direct 0 pgsteal_khugepaged 3623 pgscan_kswapd 1345025 pgscan_direct 0 pgscan_khugepaged 3623 Link: https://lkml.kernel.org/r/20221026180133.377671-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Eric Bergen Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ include/linux/khugepaged.h | 6 ++++++ include/linux/vm_event_item.h | 3 +++ mm/khugepaged.c | 5 +++++ mm/memcontrol.c | 8 ++++++-- mm/vmscan.c | 32 ++++++++++++++++++++++++-------- mm/vmstat.c | 3 +++ 7 files changed, 53 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index dc254a3cb956..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1488,12 +1488,18 @@ PAGE_SIZE multiple when read back. 
pgscan_direct (npn) Amount of scanned pages directly (in an inactive LRU list) + pgscan_khugepaged (npn) + Amount of scanned pages by khugepaged (in an inactive LRU list) + pgsteal_kswapd (npn) Amount of reclaimed pages by kswapd pgsteal_direct (npn) Amount of reclaimed pages directly + pgsteal_khugepaged (npn) + Amount of reclaimed pages by khugepaged + pgfault (npn) Total number of page faults incurred diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 70162d707caf..f68865e19b0b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); +extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); @@ -57,6 +58,11 @@ static inline int collapse_pte_mapped_thp(struct mm_struct *mm, static inline void khugepaged_min_free_kbytes_update(void) { } + +static inline bool current_is_khugepaged(void) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f..7f5d1caf5890 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -40,10 +40,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3703a56571c1..9c111273bbf9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2577,6 +2577,11 @@ void khugepaged_min_free_kbytes_update(void) mutex_unlock(&khugepaged_mutex); } +bool current_is_khugepaged(void) +{ + return kthread_func(current) == khugepaged; +} + static int madvise_collapse_errno(enum scan_result r) { /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c95e2ed6e7fd..23750cec0036 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -661,8 +661,10 @@ static const unsigned int memcg_vm_event_stat[] = { PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGFAULT, PGMAJFAULT, PGREFILL, @@ -1574,10 +1576,12 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) /* Accumulated memory events */ seq_buf_printf(&s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); + memcg_events(memcg, PGSCAN_DIRECT) + + memcg_events(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(&s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); + memcg_events(memcg, PGSTEAL_DIRECT) + + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { if (memcg_vm_event_stat[i] == PGPGIN || diff --git a/mm/vmscan.c b/mm/vmscan.c index 55a5b5d66d68..d7c71be6417d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -1047,6 +1048,24 @@ void drop_slab(void) drop_slab_node(nid); } +static int reclaimer_offset(void) +{ + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGSCAN_DIRECT - PGSCAN_KSWAPD); + 
BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); + + if (current_is_kswapd()) + return 0; + if (current_is_khugepaged()) + return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; +} + static inline int is_page_cache_freeable(struct folio *folio) { /* @@ -1599,10 +1618,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - if (current_is_kswapd()) - __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); - else - __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); return nr_succeeded; } @@ -2475,7 +2491,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); @@ -2492,7 +2508,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); @@ -4871,7 +4887,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, break; } - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); @@ -5049,7 +5065,7 @@ retry: if (walk && walk->batched) reset_batch_size(lruvec, walk); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); __count_memcg_events(memcg, item, reclaimed); diff --git a/mm/vmstat.c b/mm/vmstat.c index b2371d745e00..1ea6a5ce1c41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1271,10 +1271,13 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgsteal_khugepaged", "pgdemote_kswapd", "pgdemote_direct", + "pgdemote_khugepaged", "pgscan_kswapd", "pgscan_direct", + "pgscan_khugepaged", "pgscan_direct_throttle", "pgscan_anon", "pgscan_file", -- cgit v1.2.3 From a098c977722ca27d3b4bfeb966767af3cce45f85 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:51 -0700 Subject: mm/hugetlb_cgroup: convert __set_hugetlb_cgroup() to folios Patch series "convert hugetlb_cgroup helper functions to folios", v2. This patch series continues the conversion of hugetlb code from being managed in pages to folios by converting many of the hugetlb_cgroup helper functions to use folios. This allows the core hugetlb functions to pass in a folio to these helper functions. This patch (of 9); Change __set_hugetlb_cgroup() to use folios so it is explicit that the function operates on a head page. 
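A short sketch of the conversion pattern this series applies (the wrapper below is hypothetical; the helpers are the ones in the diff that follows): callers that still deal in struct page convert at the boundary with page_folio(), which always resolves to the head page, so the folio-taking helper can no longer be handed a tail page by accident.

    /* illustrative caller: page-based on the outside, folio-based inside */
    static void example_set_hugetlb_cgroup(struct page *page,
                                           struct hugetlb_cgroup *h_cg)
    {
            struct folio *folio = page_folio(page); /* head page by definition */

            __set_hugetlb_cgroup(folio, h_cg, false);
    }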
Link: https://lkml.kernel.org/r/20221101223059.460937-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221101223059.460937-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 14 +++++++------- mm/hugetlb_cgroup.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 630cd255d0cf..7576e9ed8afe 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,31 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline void __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), (unsigned long)h_cg); else - set_page_private(page + SUBPAGE_INDEX_CGROUP, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), (unsigned long)h_cg); } static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page_folio(page), h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page_folio(page), h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f61d132df52b..b2316bcbf634 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -314,7 +314,7 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - __set_hugetlb_cgroup(page, h_cg, rsvd); + __set_hugetlb_cgroup(page_folio(page), h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; @@ -356,7 +356,7 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, h_cg = __hugetlb_cgroup_from_page(page, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page, NULL, rsvd); + __set_hugetlb_cgroup(page_folio(page), NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), -- cgit v1.2.3 From f074732d599e19a2a5b12e54743ad5eaccbe6550 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:52 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_from_page() to folios Introduce folios in __remove_hugetlb_page() by converting hugetlb_cgroup_from_page() to use folios. Also gets rid of unsed hugetlb_cgroup_from_page_resv() function. 
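For context, a simplified sketch of what the lookup does (mirroring the diff below, with the two subpage indices folded into one hypothetical helper): the cgroup pointer is stashed in the page_private() field of a fixed tail page of the hugetlb folio, so the lookup is a folio_page() plus a cast.

    /* hypothetical helper; idx is SUBPAGE_INDEX_CGROUP or SUBPAGE_INDEX_CGROUP_RSVD */
    static inline struct hugetlb_cgroup *
    hugetlb_cgroup_at(struct folio *folio, unsigned int idx)
    {
            VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);

            if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
                    return NULL;

            return (void *)page_private(folio_page(folio, idx));
    }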
Link: https://lkml.kernel.org/r/20221101223059.460937-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 39 ++++++++++++++++++++------------------- mm/hugetlb.c | 5 +++-- mm/hugetlb_cgroup.c | 13 ++++++++----- 3 files changed, 31 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7576e9ed8afe..feb2edafc8b6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -67,27 +67,34 @@ struct hugetlb_cgroup { }; static inline struct hugetlb_cgroup * -__hugetlb_cgroup_from_page(struct page *page, bool rsvd) +__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + struct page *tail; - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - if (rsvd) - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD); - else - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP); + + if (rsvd) { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); + return (void *)page_private(tail); + } + + else { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); + return (void *)page_private(tail); + } } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, false); + return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, true); + return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, @@ -181,19 +188,13 @@ static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, { } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) -{ - return NULL; -} - -static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_resv(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be09678d0582..f86a61a73112 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1446,9 +1446,10 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, bool demote) { int nid = page_to_nid(page); + struct folio *folio = page_folio(page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b2316bcbf634..8b95c1560f9c 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page_counter *counter; struct 
hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_page(page); + page_hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely @@ -349,14 +350,15 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page, bool rsvd) { struct hugetlb_cgroup *h_cg; + struct folio *folio = page_folio(page); if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); - h_cg = __hugetlb_cgroup_from_page(page, rsvd); + h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page_folio(page), NULL, rsvd); + __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), @@ -888,13 +890,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); + struct folio *old_folio = page_folio(oldhpage); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); - h_cg = hugetlb_cgroup_from_page(oldhpage); - h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); + h_cg = hugetlb_cgroup_from_folio(old_folio); + h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(oldhpage, NULL); set_hugetlb_cgroup_rsvd(oldhpage, NULL); -- cgit v1.2.3 From de656ed376c4cb47c5713fba52f8bbfbea44f387 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:53 -0700 Subject: mm/hugetlb_cgroup: convert set_hugetlb_cgroup*() to folios Allows __prep_new_huge_page() to operate on a folio by converting set_hugetlb_cgroup*() to take in a folio. 
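For callers that still have a struct page in hand, the intended pattern after this change is a single conversion at the boundary followed by folio-only calls, roughly as the __prep_new_hugetlb_folio() hunk in the diff below shows:

	struct folio *folio = page_folio(page);	/* convert once at the boundary */

	hugetlb_set_folio_subpool(folio, NULL);
	set_hugetlb_cgroup(folio, NULL);	/* now takes a folio */
	set_hugetlb_cgroup_rsvd(folio, NULL);
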
Link: https://lkml.kernel.org/r/20221101223059.460937-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 12 ++++++------ mm/hugetlb.c | 33 +++++++++++++++++++-------------- mm/hugetlb_cgroup.c | 11 ++++++----- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index feb2edafc8b6..a7e3540f7f38 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -112,16 +112,16 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, (unsigned long)h_cg); } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, false); + __set_hugetlb_cgroup(folio, h_cg, false); } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, true); + __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,12 +199,12 @@ hugetlb_cgroup_from_folio_rsvd(struct folio *folio) return NULL; } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f86a61a73112..01ea43b22724 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1774,19 +1774,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct hstate *h, struct page *page) +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - hugetlb_vmemmap_optimize(h, page); - INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - hugetlb_set_page_subpool(page, NULL); - set_hugetlb_cgroup(page, NULL); - set_hugetlb_cgroup_rsvd(page, NULL); + hugetlb_vmemmap_optimize(h, &folio->page); + INIT_LIST_HEAD(&folio->lru); + folio->_folio_dtor = HUGETLB_PAGE_DTOR; + hugetlb_set_folio_subpool(folio, NULL); + set_hugetlb_cgroup(folio, NULL); + set_hugetlb_cgroup_rsvd(folio, NULL); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - __prep_new_huge_page(h, page); + struct folio *folio = page_folio(page); + + __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); @@ -2748,8 +2750,10 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - int nid = page_to_nid(old_page); + struct folio *old_folio = page_folio(old_page); + int nid = folio_nid(old_folio); struct page *new_page; + struct folio *new_folio; int ret = 0; /* @@ -2762,16 +2766,17 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; - __prep_new_huge_page(h, new_page); + new_folio = page_folio(new_page); 
+ __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); - if (!PageHuge(old_page)) { + if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_page too. */ goto free_new; - } else if (page_count(old_page)) { + } else if (folio_ref_count(old_folio)) { /* * Someone has grabbed the page, try to isolate it here. * Fail with -EBUSY if not possible. @@ -2780,7 +2785,7 @@ retry: ret = isolate_hugetlb(old_page, list); spin_lock_irq(&hugetlb_lock); goto free_new; - } else if (!HPageFreed(old_page)) { + } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Page's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if @@ -2818,7 +2823,7 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); /* Page has a zero ref count, but needs a ref to be freed */ - set_page_refcounted(new_page); + folio_ref_unfreeze(new_folio, 1); update_and_free_page(h, new_page, false); return ret; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 8b95c1560f9c..87a1125aa42d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -212,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); - set_hugetlb_cgroup(page, parent); + set_hugetlb_cgroup(folio, parent); out: return; } @@ -891,6 +891,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); struct folio *old_folio = page_folio(oldhpage); + struct folio *new_folio = page_folio(newhpage); if (hugetlb_cgroup_disabled()) return; @@ -898,12 +899,12 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); - set_hugetlb_cgroup(oldhpage, NULL); - set_hugetlb_cgroup_rsvd(oldhpage, NULL); + set_hugetlb_cgroup(old_folio, NULL); + set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ - set_hugetlb_cgroup(newhpage, h_cg); - set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); + set_hugetlb_cgroup(new_folio, h_cg); + set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; -- cgit v1.2.3 From 29f394304f624b06fafb3cc9c3da8779f71f4bee Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:54 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_migrate to folios Cleans up intermediate page to folio conversion code in hugetlb_cgroup_migrate() by changing its arguments from pages to folios. 
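The caller-visible effect is small: the remaining page-based caller converts at the call site and the function body works on folios throughout, roughly as in the mm/hugetlb.c hunk below:

	/* in move_hugetlb_state(), still page-based at this point in the series */
	hugetlb_cgroup_migrate(page_folio(oldpage), page_folio(newpage));
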
Link: https://lkml.kernel.org/r/20221101223059.460937-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 2 +- mm/hugetlb_cgroup.c | 8 +++----- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index a7e3540f7f38..789b6fef176d 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -177,8 +177,8 @@ extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; -extern void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage); +extern void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, @@ -286,8 +286,8 @@ static inline void hugetlb_cgroup_file_init(void) { } -static inline void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage) +static inline void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01ea43b22724..05a832886a09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7325,7 +7325,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) { struct hstate *h = page_hstate(oldpage); - hugetlb_cgroup_migrate(oldpage, newpage); + hugetlb_cgroup_migrate(page_folio(oldpage), page_folio(newpage)); set_page_owner_migrate_reason(newpage, reason); /* diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 87a1125aa42d..b1b18337a56a 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -885,13 +885,11 @@ void __init hugetlb_cgroup_file_init(void) * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ -void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; - struct hstate *h = page_hstate(oldhpage); - struct folio *old_folio = page_folio(oldhpage); - struct folio *new_folio = page_folio(newhpage); + struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; @@ -905,7 +903,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); - list_move(&newhpage->lru, &h->hugepage_activelist); + list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } -- cgit v1.2.3 From d4ab0316cc33aeedf6dcb1c2c25e097a25766132 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:57 -0700 Subject: mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios Continue to use a folio inside free_huge_page() by converting hugetlb_cgroup_uncharge_page*() to folios. 
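Inside free_huge_page(), where a folio for the page is already in hand (see the context lines in the mm/hugetlb.c hunk below), the uncharge calls then become, in outline:

	hugetlb_cgroup_uncharge_folio(hstate_index(h),
			pages_per_huge_page(h), folio);
	hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
			pages_per_huge_page(h), folio);
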
Link: https://lkml.kernel.org/r/20221101223059.460937-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 16 ++++++++-------- mm/hugetlb.c | 15 +++++++++------ mm/hugetlb_cgroup.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 789b6fef176d..c70f92fe493e 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -158,10 +158,10 @@ extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page); -extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page); -extern void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page); +extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio); +extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); @@ -254,14 +254,14 @@ hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, { } -static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { } -static inline void hugetlb_cgroup_uncharge_page_rsvd(int idx, +static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9841fb0fcaf9..e1950fff6aa9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1742,10 +1742,10 @@ void free_huge_page(struct page *page) spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); - hugetlb_cgroup_uncharge_page(hstate_index(h), - pages_per_huge_page(h), page); - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio(hstate_index(h), + pages_per_huge_page(h), folio); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); if (restore_reserve) h->resv_huge_pages++; @@ -2872,6 +2872,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; + struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; @@ -2935,6 +2936,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * a reservation exists for the allocation. */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!page) { spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); @@ -2949,6 +2951,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); /* Fall through */ } + folio = page_folio(page); hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
@@ -2978,8 +2981,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); } return page; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b1b18337a56a..4cd57f979245 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -346,11 +346,10 @@ void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, /* * Should be called with hugetlb_lock held */ -static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page, bool rsvd) +static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; - struct folio *folio = page_folio(page); if (hugetlb_cgroup_disabled()) return; @@ -368,27 +367,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, css_put(&h_cg->css); else { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } -void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } -void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, -- cgit v1.2.3 From 345c62d163496ae4b5c1ce530b1588067d8f5a8b Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:59 -0700 Subject: mm/hugetlb: convert move_hugetlb_state() to folios Clean up unmap_and_move_huge_page() by converting move_hugetlb_state() to take in folios. 
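The resulting calling convention, taken from the hunks below: move_hugetlb_state() now has the prototype

	void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio,
			int reason);

and in unmap_and_move_huge_page(), where src and dst are already folios, the call site reduces to

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(src, dst, reason);
		put_new_page = NULL;
	}
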
[akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n build] Link: https://lkml.kernel.org/r/20221101223059.460937-10-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 11 ++++++++--- mm/hugetlb.c | 22 ++++++++++++---------- mm/migrate.c | 4 ++-- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65ea34022aa2..58a30938a9b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -187,7 +187,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -407,8 +407,8 @@ static inline void putback_active_hugepage(struct page *page) { } -static inline void move_hugetlb_state(struct page *oldpage, - struct page *newpage, int reason) +static inline void move_hugetlb_state(struct folio *old_folio, + struct folio *new_folio, int reason) { } @@ -991,6 +991,11 @@ void hugetlb_unregister_node(struct node *node); #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return NULL; +} + static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e1950fff6aa9..76ebefe02827 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7324,15 +7324,15 @@ void putback_active_hugepage(struct page *page) put_page(page); } -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { - struct hstate *h = page_hstate(oldpage); + struct hstate *h = folio_hstate(old_folio); - hugetlb_cgroup_migrate(page_folio(oldpage), page_folio(newpage)); - set_page_owner_migrate_reason(newpage, reason); + hugetlb_cgroup_migrate(old_folio, new_folio); + set_page_owner_migrate_reason(&new_folio->page, reason); /* - * transfer temporary state of the new huge page. This is + * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. @@ -7341,12 +7341,14 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. 
*/ - if (HPageTemporary(newpage)) { - int old_nid = page_to_nid(oldpage); - int new_nid = page_to_nid(newpage); + if (folio_test_hugetlb_temporary(new_folio)) { + int old_nid = folio_nid(old_folio); + int new_nid = folio_nid(new_folio); + + + folio_set_hugetlb_temporary(old_folio); + folio_clear_hugetlb_temporary(new_folio); - SetHPageTemporary(oldpage); - ClearHPageTemporary(newpage); /* * There is no need to transfer the per-node surplus state diff --git a/mm/migrate.c b/mm/migrate.c index f8c85b42e2bc..4aea647a0180 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { + if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } @@ -1348,7 +1348,7 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - move_hugetlb_state(hpage, new_hpage, reason); + move_hugetlb_state(src, dst, reason); put_new_page = NULL; } -- cgit v1.2.3 From 44467bbb7e81ebcef2a5bfc9d6546bf7cd015374 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:21 +0000 Subject: mm/damon/core: add a callback for scheme target regions check Patch series "efficiently expose damos action tried regions information". DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or the 'damon_aggregated' tracepoint if the user is in user space. Those are useful if full monitoring results are necessary. However, if the user is interested in only a snapshot of the results for some regions having a specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment. Also, some DAMOS users would want to know exactly which memory regions the schemes' actions were tried on, for debugging or tuning. As DAMOS has its own internal mechanisms for quota and region prioritization, the users would need to simulate those mechanisms against the monitoring results. That's unnecessarily complex. This patchset implements a DAMON kernel API callback and a sysfs directory for efficiently exposing the information for these use cases. The new callback will be called for each region when a DAMOS action is about to be applied to it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyword, 'update_schemes_regions', to the 'state' file of a kdamond sysfs directory. Then, the DAMON sysfs interface will fill the directory with information about the regions the corresponding scheme's action was tried on during the next aggregation interval. Patches Sequence ---------------- The first one (patch 1) implements the callback for kernel space users. The following two patches (patches 2 and 3) implement sysfs directories for the information and their sub-directories. Two more patches (patches 4 and 5) implement the special keywords for filling the data into and cleaning up the directories. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document. 
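For kernel space users, registering the callback added by patch 1 (the struct damon_callback and damos_apply_scheme() diffs appear further below) would look roughly like the following sketch. The damon_region fields read here (ar.start, ar.end, nr_accesses, age) come from the existing struct damon_region and are assumptions of this sketch rather than part of the diff:

	/* sketch: observe each region a DAMOS action is about to be applied to */
	static int damos_tried_region(struct damon_ctx *ctx, struct damon_target *t,
			struct damon_region *r, struct damos *s)
	{
		pr_debug("damos will try [%lu, %lu) nr_accesses=%u age=%u\n",
			r->ar.start, r->ar.end, r->nr_accesses, r->age);
		return 0;	/* nonzero would make core skip ops.apply_scheme() */
	}

	/* at setup time, before starting the kdamond */
	ctx->callback.before_damos_apply = damos_tried_region;

The return value convention follows the mm/damon/core.c hunk below: the scheme's action is applied only when the callback returns 0.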
This patch (of 8): Getting DAMON monitoring results for only a specific access pattern (e.g., the address ranges of memory that has not been accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep-level investigation of DAMON-based operation schemes. For that, users currently need to record (in case of user space users) or iterate over (in case of kernel space users) the full monitoring results and filter them for the specific access pattern. In case of DAMOS investigation, users even need to simulate DAMOS' quota and prioritization mechanisms. That's inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. DAMON kernel API users will be able to do query-like monitoring results collection or DAMOS investigation in an efficient and simple way using it. Commits for providing the capability to user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..35630634d790 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 80d5937fe337..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -772,6 +772,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + int err = 0; if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -782,7 +783,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); -- cgit v1.2.3 From ca92ea3dc5a2b01f98e9f02b7a6bc03be06fe124 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:50 -0400 Subject: mm: always compile in pte markers Patch series "mm: Use pte marker for swapin errors". This series uses the pte marker to replace the swapin error swap entry, which saves one more swap entry slot for swap devices. A new pte marker bit is defined. 
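The end state of the series, in outline (the marker bit and helpers are introduced by patch 2 below; this simply mirrors the mm/swapfile.c and mm/memory.c hunks there):

	/* encoding: a swapin error becomes a pte marker, not a dedicated swap type */
	pte_t pteval = swp_entry_to_pte(make_swapin_error_entry());
	set_pte_at(vma->vm_mm, addr, pte, pteval);

	/* decoding, in handle_pte_marker(): data corruption outranks uffd-wp */
	if (marker & PTE_MARKER_SWAPIN_ERROR)
		return VM_FAULT_SIGBUS;
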
This patch (of 2): The PTE markers code is tiny and now it's enabled for most of the distributions. It's fine to keep it as-is, but to make a broader use of it (e.g. replacing read error swap entry) it needs to be there always otherwise we need special code path to take care of !PTE_MARKER case. It'll be easier just make pte marker always exist. Use this chance to extend its usage to anonymous too by simply touching up some of the old comments, because it'll be used for anonymous pages in the follow up patches. Link: https://lkml.kernel.org/r/20221030214151.402274-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221030214151.402274-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++------- include/linux/swapops.h | 31 ------------------------------- mm/Kconfig | 7 ------- mm/memory.c | 7 +++---- 4 files changed, 6 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 369d7799205d..211aeca9bfa7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -60,17 +60,13 @@ static inline int current_is_kswapd(void) SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ SWP_PTE_MARKER_NUM) /* - * PTE markers are used to persist information onto PTEs that are mapped with - * file-backed memories. As its name "PTE" hints, it should only be applied to - * the leaves of pgtables. + * PTE markers are used to persist information onto PTEs that otherwise + * should be a none pte. As its name "PTE" hints, it should only be + * applied to the leaves of pgtables. */ -#ifdef CONFIG_PTE_MARKER #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) -#else -#define SWP_PTE_MARKER_NUM 0 -#endif /* * Unaddressable device memory support. See include/linux/hmm.h and diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3ba9bf56899d..35c1fe62d2e1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -412,8 +412,6 @@ typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) #define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) -#ifdef CONFIG_PTE_MARKER - static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); @@ -434,32 +432,6 @@ static inline bool is_pte_marker(pte_t pte) return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } -#else /* CONFIG_PTE_MARKER */ - -static inline swp_entry_t make_pte_marker_entry(pte_marker marker) -{ - /* This should never be called if !CONFIG_PTE_MARKER */ - WARN_ON_ONCE(1); - return swp_entry(0, 0); -} - -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return false; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return 0; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return false; -} - -#endif /* CONFIG_PTE_MARKER */ - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -477,9 +449,6 @@ static inline pte_t make_pte_marker(pte_marker marker) * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. - * - * For systems configured with !CONFIG_PTE_MARKER this will be automatically - * optimized to pte_none(). 
*/ static inline int pte_none_mostly(pte_t pte) { diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..4b28800d9be1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1107,17 +1107,10 @@ config HAVE_ARCH_USERFAULTFD_MINOR help Arch has userfaultfd minor fault support -config PTE_MARKER - bool - - help - Allows to create marker PTEs for file-backed memory. - config PTE_MARKER_UFFD_WP bool "Userfaultfd write protection support for shmem/hugetlbfs" default y depends on HAVE_ARCH_USERFAULTFD_WP - select PTE_MARKER help Allows to create marker PTEs for userfaultfd write protection diff --git a/mm/memory.c b/mm/memory.c index 659620b6770f..b79d27533722 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3662,11 +3662,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) unsigned long marker = pte_marker_get(entry); /* - * PTE markers should always be with file-backed memories, and the - * marker should never be empty. If anything weird happened, the best - * thing to do is to kill the process along with its mm. + * PTE markers should never be empty. If anything weird happened, + * the best thing to do is to kill the process along with its mm. */ - if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; if (pte_marker_entry_uffd_wp(entry)) -- cgit v1.2.3 From 15520a3f046998e3f57e695743e99b0875e2dae7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:51 -0400 Subject: mm: use pte markers for swap errors PTE markers are ideal mechanism for things like SWP_SWAPIN_ERROR. Using a whole swap entry type for this purpose can be an overkill, especially if we already have PTE markers. Define a new bit for swapin error and replace it with pte markers. Then we can safely drop SWP_SWAPIN_ERROR and give one device slot back to swap. We used to have SWP_SWAPIN_ERROR taking the page pfn as part of the swap entry, but it's never used. Neither do I see how it can be useful because normally the swapin failure should not be caused by a bad page but bad swap device. Drop it alongside. Link: https://lkml.kernel.org/r/20221030214151.402274-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +----- include/linux/swapops.h | 26 ++++++++++++++------------ mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- mm/swapfile.c | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 211aeca9bfa7..fec6647a289a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,10 +55,6 @@ static inline int current_is_kswapd(void) * actions on faults. */ -#define SWP_SWAPIN_ERROR_NUM 1 -#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ - SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ - SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be @@ -121,7 +117,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) + SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. 
The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 35c1fe62d2e1..27ade4f22abb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -162,16 +162,6 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } -static inline swp_entry_t make_swapin_error_entry(struct page *page) -{ - return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); -} - -static inline int is_swapin_error_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_SWAPIN_ERROR; -} - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -409,8 +399,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_SWAPIN_ERROR BIT(1) +#define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -437,6 +428,17 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } +static inline swp_entry_t make_swapin_error_entry(void) +{ + return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); +} + /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker diff --git a/mm/memory.c b/mm/memory.c index b79d27533722..142c4229549b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3668,6 +3668,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; + /* Higher priority than uffd-wp when data corrupted */ + if (marker & PTE_MARKER_SWAPIN_ERROR) + return VM_FAULT_SIGBUS; + if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); @@ -3727,8 +3731,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_swapin_error_entry(entry)) { - ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 0a7c4a748811..7428ae3fa4b9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1682,7 +1682,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(&folio->page); + swapin_error = make_swapin_error_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); diff --git a/mm/swapfile.c b/mm/swapfile.c index 72e481aacd5d..03fe0949f6b2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1781,7 +1781,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t pteval; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + pteval = swp_entry_to_pte(make_swapin_error_entry()); set_pte_at(vma->vm_mm, addr, pte, pteval); swap_free(entry); ret = 0; -- cgit v1.2.3 From dad6a5eb55564845aa17b8b20fa834af21e46c48 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:48:45 -0700 Subject: mm,hugetlb: use folio fields in second tail page Patch series "mm,huge,rmap: unify and speed up compound mapcounts". 
This patch (of 3): We want to declare one more int in the first tail of a compound page: that first tail page being valuable property, since every compound page has a first tail, but perhaps no more than that. No problem on 64-bit: there is already space for it. No problem with 32-bit THPs: 5.18 commit 5232c63f46fd ("mm: Make compound_pincount always available") kindly cleared the space for it, apparently not realizing that only 64-bit architectures enable CONFIG_THP_SWAP (whose use of tail page->private might conflict) - but make sure of that in its Kconfig. But hugetlb pages use tail page->private of the first tail page for a subpool pointer, which will conflict; and they also use page->private of the 2nd, 3rd and 4th tails. Undo "mm: add private field of first tail to struct page and struct folio"'s recent addition of private_1 to the folio tail: instead add hugetlb_subpool, hugetlb_cgroup, hugetlb_cgroup_rsvd, hugetlb_hwpoison to a second tail page of the folio: THP has long been using several fields of that tail, so make better use of it for hugetlb too. This is not how a generic folio should be declared in future, but it is an effective transitional way to make use of it. Delete the SUBPAGE_INDEX stuff, but keep __NR_USED_SUBPAGE: now 3. [hughd@google.com: prefix folio's page_1 and page_2 with double underscore, give folio's _flags_2 and _head_2 a line documentation each] Link: https://lkml.kernel.org/r/9e2cb6b-5b58-d3f2-b5ee-5f8a14e8f10@google.com Link: https://lkml.kernel.org/r/5f52de70-975-e94f-f141-543765736181@google.com Link: https://lkml.kernel.org/r/3818cc9a-9999-d064-d778-9c94c5911e6@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 23 +++---------- include/linux/hugetlb_cgroup.h | 31 +++++------------- include/linux/mm_types.h | 74 ++++++++++++++++++++++++++++-------------- mm/Kconfig | 2 +- mm/memory-failure.c | 5 ++- 5 files changed, 67 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 58a30938a9b1..551834cd5299 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,22 +33,9 @@ typedef struct { unsigned long pd; } hugepd_t; /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail - * struct page to store the metadata. In order to avoid conflicts caused by - * subsequent use of more tail struct pages, we gather these discrete indexes - * of tail struct page here. + * struct page to store the metadata. 
*/ -enum { - SUBPAGE_INDEX_SUBPOOL = 1, /* reuse page->private */ -#ifdef CONFIG_CGROUP_HUGETLB - SUBPAGE_INDEX_CGROUP, /* reuse page->private */ - SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ - __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, -#endif -#ifdef CONFIG_MEMORY_FAILURE - SUBPAGE_INDEX_HWPOISON, -#endif - __NR_USED_SUBPAGE, -}; +#define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; @@ -722,11 +709,11 @@ extern unsigned int default_hstate_idx; static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { - return (void *)folio_get_private_1(folio); + return folio->_hugetlb_subpool; } /* - * hugetlb page subpool pointer located in hpage[1].private + * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { @@ -736,7 +723,7 @@ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { - folio_set_private_1(folio, (unsigned long)subpool); + folio->_hugetlb_subpool = subpool; } static inline void hugetlb_set_page_subpool(struct page *hpage, diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c70f92fe493e..f706626a8063 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -24,12 +24,10 @@ struct file_region; #ifdef CONFIG_CGROUP_HUGETLB /* * Minimum page order trackable by hugetlb cgroup. - * At least 4 pages are necessary for all the tracking information. - * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault - * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD]) - * is the reservation usage cgroup. + * At least 3 pages are necessary for all the tracking information. + * The second tail page contains all of the hugetlb-specific fields. 
*/ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1) +#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) enum hugetlb_memory_event { HUGETLB_MAX, @@ -69,21 +67,13 @@ struct hugetlb_cgroup { static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - struct page *tail; - VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - - if (rsvd) { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); - return (void *)page_private(tail); - } - - else { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); - return (void *)page_private(tail); - } + if (rsvd) + return folio->_hugetlb_cgroup_rsvd; + else + return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) @@ -101,15 +91,12 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), - (unsigned long)h_cg); + folio->_hugetlb_cgroup_rsvd = h_cg; else - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), - (unsigned long)h_cg); + folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e86861ff5bbd..44f1f8b6be02 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -145,15 +145,22 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ - unsigned long _private_1; #endif }; - struct { /* Second tail page of compound page */ + struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; /* For both global and memcg */ struct list_head deferred_list; }; + struct { /* Second tail page of hugetlb page */ + unsigned long _hugetlb_pad_1; /* compound_head */ + void *hugetlb_subpool; + void *hugetlb_cgroup; + void *hugetlb_cgroup_rsvd; + void *hugetlb_hwpoison; + /* No more space on 32-bit: use third tail if more */ + }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -260,13 +267,18 @@ struct page { * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @_flags_1: For large folios, additional page flags. - * @__head: Points to the folio. Do not use. + * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_private_1: Do not use directly, call folio_get_private_1(). + * @_flags_2: For alignment. Do not use. + * @_head_2: Points to the folio. Do not use. + * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. + * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * * A folio is a physically, virtually and logically contiguous set * of bytes. 
It is a power-of-two in size, and it is aligned to that @@ -305,16 +317,31 @@ struct folio { }; struct page page; }; - unsigned long _flags_1; - unsigned long __head; - unsigned char _folio_dtor; - unsigned char _folio_order; - atomic_t _total_mapcount; - atomic_t _pincount; + union { + struct { + unsigned long _flags_1; + unsigned long _head_1; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; #ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; + unsigned int _folio_nr_pages; #endif - unsigned long _private_1; + }; + struct page __page_1; + }; + union { + struct { + unsigned long _flags_2; + unsigned long _head_2; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; + }; + struct page __page_2; + }; }; #define FOLIO_MATCH(pg, fl) \ @@ -335,16 +362,25 @@ FOLIO_MATCH(memcg_data, memcg_data); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); -FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 2 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_2); +FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); +FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); +FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); +FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { @@ -388,16 +424,6 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } -static inline void folio_set_private_1(struct folio *folio, unsigned long private) -{ - folio->_private_1 = private; -} - -static inline unsigned long folio_get_private_1(struct folio *folio) -{ - return folio->_private_1; -} - struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/mm/Kconfig b/mm/Kconfig index 4b28800d9be1..c86b69aff7d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -775,7 +775,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. XXX: For now, swap cluster backing transparent huge page diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 779a426d2cab..63d8501001c6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1687,8 +1687,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #ifdef CONFIG_HUGETLB_PAGE /* * Struct raw_hwp_page represents information about "raw error page", - * constructing singly linked list originated from ->private field of - * SUBPAGE_INDEX_HWPOISON-th tail page. + * constructing singly linked list from ->_hugetlb_hwpoison field of folio. 
*/ struct raw_hwp_page { struct llist_node node; @@ -1697,7 +1696,7 @@ struct raw_hwp_page { static inline struct llist_head *raw_hwp_list_head(struct page *hpage) { - return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); + return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) -- cgit v1.2.3 From cb67f4282bf9693658dbda934a441ddbbb1446df Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:51:38 -0700 Subject: mm,thp,rmap: simplify compound page mapcount handling Compound page (folio) mapcount calculations have been different for anon and file (or shmem) THPs, and involved the obscure PageDoubleMap flag. And each huge mapping and unmapping of a file (or shmem) THP involved atomically incrementing and decrementing the mapcount of every subpage of that huge page, dirtying many struct page cachelines. Add subpages_mapcount field to the struct folio and first tail page, so that the total of subpage mapcounts is available in one place near the head: then page_mapcount() and total_mapcount() and page_mapped(), and their folio equivalents, are so quick that anon and file and hugetlb don't need to be optimized differently. Delete the unloved PageDoubleMap. page_add and page_remove rmap functions must now maintain the subpages_mapcount as well as the subpage _mapcount, when dealing with pte mappings of huge pages; and correct maintenance of NR_ANON_MAPPED and NR_FILE_MAPPED statistics still needs reading through the subpages, using nr_subpages_unmapped() - but only when first or last pmd mapping finds subpages_mapcount raised (double-map case, not the common case). But are those counts (used to decide when to split an anon THP, and in vmscan's pagecache_reclaimable heuristic) correctly maintained? Not quite: since page_remove_rmap() (and also split_huge_pmd()) is often called without page lock, there can be races when a subpage pte mapcount 0<->1 while compound pmd mapcount 0<->1 is scanning - races which the previous implementation had prevented. The statistics might become inaccurate, and even drift down until they underflow through 0. That is not good enough, but is better dealt with in a followup patch. Update a few comments on first and second tail page overlaid fields. hugepage_add_new_anon_rmap() has to "increment" compound_mapcount, but subpages_mapcount and compound_pincount are already correctly at 0, so delete its reinitialization of compound_pincount. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 119 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 92ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. But there might be some benchmarks which would show a slowdown, because tail struct pages now fall out of cache until final freeing checks them. Link: https://lkml.kernel.org/r/47ad693-717-79c8-e1ba-46c3a6602e48@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. 
Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 18 ------ include/linux/mm.h | 85 +++++++++++++++++------- include/linux/mm_types.h | 21 +++++- include/linux/page-flags.h | 21 ------ include/linux/rmap.h | 2 + mm/debug.c | 5 +- mm/folio-compat.c | 6 -- mm/huge_memory.c | 36 +++-------- mm/hugetlb.c | 2 + mm/khugepaged.c | 11 +--- mm/page_alloc.c | 27 ++++---- mm/rmap.c | 142 +++++++++++++++++++++++------------------ mm/util.c | 79 ----------------------- 13 files changed, 194 insertions(+), 261 deletions(-) (limited to 'include') diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 216db1d67d04..a560e0c01b16 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -125,24 +125,6 @@ pages: ->_mapcount of all sub-pages in order to have race-free detection of last unmap of subpages. -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. - -This optimization is required to lower the overhead of per-subpage mapcount -tracking. The alternative is to alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page is split -for the first time, but still have a PMD mapping. The additional references -go away with the last compound_mapcount. - -File pages get PG_double_map set on the first map of the page with PTE and -goes away when the page gets evicted from the page cache. - split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page structures. It can be done easily for refcounts taken by page table diff --git a/include/linux/mm.h b/include/linux/mm.h index 3950ef45b9a9..a904c2d60f12 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -818,8 +818,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes; look at folio_mapcount() or page_mapcount() - * instead. + * debugging purposes - it does not include PTE-mapped sub-pages; look + * at folio_mapcount() or page_mapcount() or total_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -829,12 +829,20 @@ static inline int folio_entire_mapcount(struct folio *folio) /* * Mapcount of compound page as a whole, does not include mapped sub-pages. - * - * Must be called only for compound pages. + * Must be called only on head of compound page. */ -static inline int compound_mapcount(struct page *page) +static inline int head_compound_mapcount(struct page *head) { - return folio_entire_mapcount(page_folio(page)); + return atomic_read(compound_mapcount_ptr(head)) + 1; +} + +/* + * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Must be called only on head of compound page. 
+ */ +static inline int head_subpages_mapcount(struct page *head) +{ + return atomic_read(subpages_mapcount_ptr(head)); } /* @@ -847,11 +855,9 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -int __page_mapcount(struct page *page); - /* * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount(). + * compound_mapcount of compound_head of page. * * Result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). @@ -859,25 +865,61 @@ int __page_mapcount(struct page *page); */ static inline int page_mapcount(struct page *page) { - if (unlikely(PageCompound(page))) - return __page_mapcount(page); - return atomic_read(&page->_mapcount) + 1; -} + int mapcount = atomic_read(&page->_mapcount) + 1; -int folio_mapcount(struct folio *folio); + if (likely(!PageCompound(page))) + return mapcount; + page = compound_head(page); + return head_compound_mapcount(page) + mapcount; +} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int total_mapcount(struct page *page) { - return folio_mapcount(page_folio(page)); + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + return head_compound_mapcount(page) + head_subpages_mapcount(page); } -#else -static inline int total_mapcount(struct page *page) +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped, + * even if this particular subpage is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) { - return page_mapcount(page); + return total_mapcount(page) > 0; +} + +/** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +static inline int folio_mapcount(struct folio *folio) +{ + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + return atomic_read(folio_mapcount_ptr(folio)) + 1 + + atomic_read(folio_subpages_mapcount_ptr(folio)); +} + +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. + */ +static inline bool folio_mapped(struct folio *folio) +{ + return folio_mapcount(folio) > 0; } -#endif static inline struct page *virt_to_head_page(const void *x) { @@ -1800,9 +1842,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -bool page_mapped(struct page *page); -bool folio_mapped(struct folio *folio); - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44f1f8b6be02..44a1a699b5ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -142,6 +142,7 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t subpages_mapcount; atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ @@ -270,7 +271,8 @@ struct page { * @_head_1: Points to the folio. Do not use. 
* @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_subpages_mapcount: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. @@ -323,7 +325,8 @@ struct folio { unsigned long _head_1; unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _total_mapcount; + atomic_t _compound_mapcount; + atomic_t _subpages_mapcount; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -365,7 +368,8 @@ FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -388,11 +392,22 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } +static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->subpages_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; } +static inline atomic_t *subpages_mapcount_ptr(struct page *page) +{ + return &page[1].subpages_mapcount; +} + static inline atomic_t *compound_pincount_ptr(struct page *page) { return &page[1].compound_pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..e42c55a7e012 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,9 +176,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_workingset, - #ifdef CONFIG_MEMORY_FAILURE /* * Compound pages. Stored in first tail page's flags. @@ -874,29 +871,11 @@ static inline int PageTransTail(struct page *page) { return PageTail(page); } - -/* - * PageDoubleMap indicates that the compound page is mapped with PTEs as well - * as PMDs. - * - * This is required for optimization of rmap operations for THP: we can postpone - * per small page mapcount accounting (and its overhead from atomic operations) - * until the first PMD split. - * - * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up - * by one. This reference will go away with last compound_mapcount. - * - * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). 
- */ -PAGEFLAG(DoubleMap, double_map, PF_SECOND) - TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) TESTPAGEFLAG_FALSE(TransTail, transtail) -PAGEFLAG_FALSE(DoubleMap, double_map) - TESTSCFLAG_FALSE(DoubleMap, double_map) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..1973649e8f93 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -206,6 +206,8 @@ void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, static inline void __page_dup_rmap(struct page *page, bool compound) { + if (!compound && PageCompound(page)) + atomic_inc(subpages_mapcount_ptr(compound_head(page))); atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } diff --git a/mm/debug.c b/mm/debug.c index 0fd15ba70d16..7f8e5f744e42 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - folio_entire_mapcount(folio), + head_compound_mapcount(head), + head_subpages_mapcount(head), head_compound_pincount(head)); } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index bac2a366aada..cbfe51091c39 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -39,12 +39,6 @@ void wait_for_stable_page(struct page *page) } EXPORT_SYMBOL_GPL(wait_for_stable_page); -bool page_mapped(struct page *page) -{ - return folio_mapped(page_folio(page)); -} -EXPORT_SYMBOL(page_mapped); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b26998d1845f..7703169107c6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2142,6 +2142,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); + atomic_add(HPAGE_PMD_NR, subpages_mapcount_ptr(page)); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2225,33 +2226,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); } - if (!pmd_migration) { - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && - !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); - } - - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. 
*/ - __mod_lruvec_page_state(page, NR_ANON_THPS, - -HPAGE_PMD_NR); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); - } - } - unlock_page_memcg(page); - - /* Above is effectively page_remove_rmap(page, vma, true) */ - munlock_vma_page(page, vma, true); - } + if (!pmd_migration) + page_remove_rmap(page, vma, true); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2453,7 +2429,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first tail page is compound_mapcount */ + /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; @@ -2463,6 +2439,10 @@ static void __split_huge_page_tail(struct page *head, int tail, * page->private should not be set in tail pages with the exception * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. + * + * What of 32-bit systems, on which head[1].compound_pincount overlays + * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and + * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 76ebefe02827..4f1338d82aab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1333,6 +1333,7 @@ static void __destroy_compound_gigantic_page(struct page *page, struct page *p; atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++) { @@ -1852,6 +1853,7 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); return true; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9c111273bbf9..0d8f548d9d7e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1238,15 +1238,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see total_mapcount > refcount - * in some cases. - * For example, one process with one forked child process. - * The parent has the PMD split due to MADV_DONTNEED, then - * the child is trying unmap the whole PMD, but khugepaged - * may be scanning the parent between the child has - * PageDoubleMap flag cleared and dec the mapcount. So - * khugepaged may see total_mapcount > refcount. - * + * Here the check may be racy: + * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). 
Anyway the same check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e60657875d3..0705917ddf54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -798,6 +798,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); } @@ -1324,11 +1325,19 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping may be compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { + /* the first tail page: these may be in place of ->mapping */ + if (unlikely(head_compound_mapcount(head_page))) { bad_page(page, "nonzero compound_mapcount"); goto out; } + if (unlikely(head_subpages_mapcount(head_page))) { + bad_page(page, "nonzero subpages_mapcount"); + goto out; + } + if (unlikely(head_compound_pincount(head_page))) { + bad_page(page, "nonzero compound_pincount"); + goto out; + } break; case 2: /* @@ -1431,10 +1440,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) { - ClearPageDoubleMap(page); + if (compound) ClearPageHasHWPoisoned(page); - } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -6874,13 +6881,11 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores important compound page info. + * Call prep_compound_head() after the first tail page has + * been initialized, to not have the data overwritten. */ - if (pfn == head_pfn + 2) + if (pfn == head_pfn + 1) prep_compound_head(head, order); } } diff --git a/mm/rmap.c b/mm/rmap.c index 3b2d18bbdc44..f43339ea4970 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,6 +1085,24 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +/* + * When mapping a THP's first pmd, or unmapping its last pmd, if that THP + * also has pte mappings, then those must be discounted: in order to maintain + * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, + * and to decide when an anon THP should be put on the deferred split queue. 
+ */ +static int nr_subpages_unmapped(struct page *head, int nr_subpages) +{ + int nr = nr_subpages; + int i; + + /* Discount those subpages mapped by pte */ + for (i = 0; i < nr_subpages; i++) + if (atomic_read(&head[i]._mapcount) >= 0) + nr--; + return nr; +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1194,6 +1212,7 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + int nr, nr_pages; bool compound = flags & RMAP_COMPOUND; bool first; @@ -1202,28 +1221,32 @@ void page_add_anon_rmap(struct page *page, else VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound) { + if (compound && PageTransHuge(page)) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); mapcount = compound_mapcount_ptr(page); first = atomic_inc_and_test(mapcount); + + nr = nr_pages = thp_nr_pages(page); + if (first && head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); } else { + nr = 1; + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_inc(subpages_mapcount_ptr(head)); + nr = !head_compound_mapcount(head); + } first = atomic_inc_and_test(&page->_mapcount); } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) { - int nr = compound ? thp_nr_pages(page) : 1; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption - * disabled. - */ if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pages); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1265,8 +1288,6 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { /* increment count (starts at -1) */ @@ -1287,29 +1308,19 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + int nr = 0; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + int nr_pages; - for (i = 0; i < nr_pages; i++) { - if (atomic_inc_and_test(&page[i]._mapcount)) - nr++; - } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; - /* - * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); - * but page lock is held by all page_add_file_rmap() compound - * callers, and SetPageDoubleMap below warns if !PageLocked: - * so here is a place that DoubleMap can be safely cleared. 
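The discount performed by nr_subpages_unmapped() can be illustrated with a small standalone sketch (an assumed userspace model, not the kernel function; 8 subpages stand in for HPAGE_PMD_NR):

#include <stdio.h>

/*
 * subpage_mapcount[] uses the struct-page convention: -1 means unmapped,
 * >= 0 means mapped by at least one PTE.
 */
static int model_nr_subpages_unmapped(const int *subpage_mapcount, int nr_subpages)
{
        int nr = nr_subpages;

        for (int i = 0; i < nr_subpages; i++)
                if (subpage_mapcount[i] >= 0)  /* already counted via its PTE map */
                        nr--;
        return nr;
}

int main(void)
{
        /* 8 subpages for brevity; three of them are currently PTE-mapped */
        int subpage_mapcount[8] = { -1, 0, -1, 2, -1, -1, 0, -1 };

        /* On the first PMD map, NR_*_MAPPED goes up by only the five new ones */
        printf("adjust NR_*_MAPPED by %d\n",
               model_nr_subpages_unmapped(subpage_mapcount, 8));
        return 0;
}

With three subpages already PTE-mapped, the first PMD map may only add the remaining five to NR_ANON_MAPPED or NR_FILE_MAPPED, since the other three are already accounted through their PTE mappings.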
- */ - VM_WARN_ON_ONCE(!PageLocked(page)); - if (nr == nr_pages && PageDoubleMap(page)) - ClearPageDoubleMap(page); + nr = nr_pages = thp_nr_pages(page); + if (head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, @@ -1318,11 +1329,15 @@ void page_add_file_rmap(struct page *page, __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, nr_pages); } else { - if (PageTransCompound(page) && page_mapping(page)) { - VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + bool pmd_mapped = false; + + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_inc(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); } - if (atomic_inc_and_test(&page->_mapcount)) + if (atomic_inc_and_test(&page->_mapcount) && !pmd_mapped) nr++; } out: @@ -1335,7 +1350,7 @@ out: static void page_remove_file_rmap(struct page *page, bool compound) { - int i, nr = 0; + int nr = 0; VM_BUG_ON_PAGE(compound && !PageHead(page), page); @@ -1348,14 +1363,15 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? */ if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + int nr_pages; - for (i = 0; i < nr_pages; i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; - } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; + + nr = nr_pages = thp_nr_pages(page); + if (head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); + if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); @@ -1363,17 +1379,25 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, -nr_pages); } else { - if (atomic_add_negative(-1, &page->_mapcount)) + bool pmd_mapped = false; + + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_dec(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); + } + if (atomic_add_negative(-1, &page->_mapcount) && !pmd_mapped) nr++; } -out: + if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); } static void page_remove_anon_compound_rmap(struct page *page) { - int i, nr; + int nr, nr_pages; if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; @@ -1385,27 +1409,19 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); + nr = nr_pages = thp_nr_pages(page); + __mod_lruvec_page_state(page, NR_ANON_THPS, -nr); - if (TestClearPageDoubleMap(page)) { - /* - * Subpages can be mapped with PTEs too. Check how many of - * them are still mapped. - */ - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; - } + if (head_subpages_mapcount(page)) { + nr = nr_subpages_unmapped(page, nr_pages); /* * Queue the page for deferred split if at least one small * page of the compound page is unmapped, but at least one * small page is still mapped. 
*/ - if (nr && nr < thp_nr_pages(page)) + if (nr && nr < nr_pages) deferred_split_huge_page(page); - } else { - nr = thp_nr_pages(page); } if (nr) @@ -1423,6 +1439,8 @@ static void page_remove_anon_compound_rmap(struct page *page) void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { + bool pmd_mapped = false; + lock_page_memcg(page); if (!PageAnon(page)) { @@ -1435,15 +1453,17 @@ void page_remove_rmap(struct page *page, goto out; } + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_dec(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); + } + /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) + if (!atomic_add_negative(-1, &page->_mapcount) || pmd_mapped) goto out; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. - */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (PageTransCompound(page)) @@ -2569,8 +2589,8 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); ClearHPageRestoreReserve(page); __page_set_anon_rmap(page, vma, address, 1); } diff --git a/mm/util.c b/mm/util.c index 12984e76767e..b56c92fb910f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,32 +717,6 @@ void *page_rmapping(struct page *page) return folio_raw_mapping(page_folio(page)); } -/** - * folio_mapped - Is this folio mapped into userspace? - * @folio: The folio. - * - * Return: True if any page in this folio is referenced by user page tables. - */ -bool folio_mapped(struct folio *folio) -{ - long i, nr; - - if (!folio_test_large(folio)) - return atomic_read(&folio->_mapcount) >= 0; - if (atomic_read(folio_mapcount_ptr(folio)) >= 0) - return true; - if (folio_test_hugetlb(folio)) - return false; - - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) { - if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) - return true; - } - return false; -} -EXPORT_SYMBOL(folio_mapped); - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -783,59 +757,6 @@ struct address_space *folio_mapping(struct folio *folio) } EXPORT_SYMBOL(folio_mapping); -/* Slow path of page_mapcount() for compound pages */ -int __page_mapcount(struct page *page) -{ - int ret; - - ret = atomic_read(&page->_mapcount) + 1; - /* - * For file THP page->_mapcount contains total number of mapping - * of the page: no need to look into compound_mapcount. - */ - if (!PageAnon(page) && !PageHuge(page)) - return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - return ret; -} -EXPORT_SYMBOL_GPL(__page_mapcount); - -/** - * folio_mapcount() - Calculate the number of mappings of this folio. - * @folio: The folio. - * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. - * - * Return: The number of times this folio is mapped. 
- */ -int folio_mapcount(struct folio *folio) -{ - int i, compound, nr, ret; - - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; - - compound = folio_entire_mapcount(folio); - if (folio_test_hugetlb(folio)) - return compound; - ret = compound; - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) - ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!folio_test_anon(folio)) - return ret - compound * nr; - if (folio_test_double_map(folio)) - ret -= nr; - return ret; -} - /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. -- cgit v1.2.3 From 9bd3155ed83b723be719e522760f107229e2a61b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:53:45 -0700 Subject: mm,thp,rmap: lock_compound_mapcounts() on THP mapcounts Fix the races in maintaining compound_mapcount, subpages_mapcount and subpage _mapcount by using PG_locked in the first tail of any compound page for a bit_spin_lock() on such modifications; skipping the usual atomic operations on those fields in this case. Bring page_remove_file_rmap() and page_remove_anon_compound_rmap() back into page_remove_rmap() itself. Rearrange page_add_anon_rmap() and page_add_file_rmap() and page_remove_rmap() to follow the same "if (compound) {lock} else if (PageCompound) {lock} else {atomic}" pattern (with a PageTransHuge in the compound test, like before, to avoid BUG_ONs and optimize away that block when THP is not configured). Move all the stats updates outside, after the bit_spin_locked section, so that it is sure to be a leaf lock. Add page_dup_compound_rmap() to manage compound locking versus atomics in sync with the rest. In particular, hugetlb pages are still using the atomics: to avoid unnecessary interference there, and because they never have subpage mappings; but this exception can easily be changed. Conveniently, page_dup_compound_rmap() turns out to suit an anon THP's __split_huge_pmd_locked() too. bit_spin_lock() is not popular with PREEMPT_RT folks: but PREEMPT_RT sensibly excludes TRANSPARENT_HUGEPAGE already, so its only exposure is to the non-hugetlb non-THP pte-mapped compound pages (with large folios being currently dependent on TRANSPARENT_HUGEPAGE). There is never any scan of subpages in this case; but we have chosen to use PageCompound tests rather than PageTransCompound tests to gate the use of lock_compound_mapcounts(), so that page_mapped() is correct on all compound pages, whether or not TRANSPARENT_HUGEPAGE is enabled: could that be a problem for PREEMPT_RT, when there is contention on the lock - under heavy concurrent forking for example? If so, then it can be turned into a sleeping lock (like folio_lock()) when PREEMPT_RT. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 115 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 86ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. Mapping huge pages by ptes is largely unaffected but variable: between 5% faster and 5% slower in what I've recorded. Contention on the lock is likely to behave worse than contention on the atomics behaved. Link: https://lkml.kernel.org/r/1b42bd1a-8223-e827-602f-d466c2db7d3c@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. 
Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 16 +- include/linux/rmap.h | 14 +- mm/huge_memory.c | 3 +- mm/rmap.c | 333 +++++++++++++++++++++++------------------ 4 files changed, 204 insertions(+), 162 deletions(-) (limited to 'include') diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index a560e0c01b16..1e2a637cc607 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -117,13 +117,15 @@ pages: - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeeds on tail pages. - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. - - - map/unmap of the whole compound page is accounted for in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. + - map/unmap of PMD entry for the whole compound page increment/decrement + ->compound_mapcount, stored in the first tail page of the compound page. + + - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page, and also increment/decrement + ->subpages_mapcount, stored in first tail page of the compound page. + In order to have race-free accounting of sub-pages mapped, changes to + sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are + are all locked by bit_spin_lock of PG_locked in the first tail ->flags. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1973649e8f93..011a7530dc76 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,16 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -static inline void __page_dup_rmap(struct page *page, bool compound) -{ - if (!compound && PageCompound(page)) - atomic_inc(subpages_mapcount_ptr(compound_head(page))); - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); -} +void page_dup_compound_rmap(struct page *page, bool compound); static inline void page_dup_file_rmap(struct page *page, bool compound) { - __page_dup_rmap(page, compound); + if (PageCompound(page)) + page_dup_compound_rmap(page, compound); + else + atomic_inc(&page->_mapcount); } /** @@ -262,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. 
*/ dup: - __page_dup_rmap(page, compound); + page_dup_file_rmap(page, compound); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7703169107c6..114517e8cbfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2142,7 +2142,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - atomic_add(HPAGE_PMD_NR, subpages_mapcount_ptr(page)); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2222,7 +2221,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); if (!pmd_migration) - atomic_inc(&page[i]._mapcount); + page_dup_compound_rmap(page + i, false); pte_unmap(pte); } diff --git a/mm/rmap.c b/mm/rmap.c index f43339ea4970..512e53cae2ca 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,11 +1085,66 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +struct compound_mapcounts { + unsigned int compound_mapcount; + unsigned int subpages_mapcount; +}; + +/* + * lock_compound_mapcounts() first locks, then copies subpages_mapcount and + * compound_mapcount from head[1].compound_mapcount and subpages_mapcount, + * converting from struct page's internal representation to logical count + * (that is, adding 1 to compound_mapcount to hide its offset by -1). + */ +static void lock_compound_mapcounts(struct page *head, + struct compound_mapcounts *local) +{ + bit_spin_lock(PG_locked, &head[1].flags); + local->compound_mapcount = atomic_read(compound_mapcount_ptr(head)) + 1; + local->subpages_mapcount = atomic_read(subpages_mapcount_ptr(head)); +} + +/* + * After caller has updated subpage._mapcount, local subpages_mapcount and + * local compound_mapcount, as necessary, unlock_compound_mapcounts() converts + * and copies them back to the compound head[1] fields, and then unlocks. + */ +static void unlock_compound_mapcounts(struct page *head, + struct compound_mapcounts *local) +{ + atomic_set(compound_mapcount_ptr(head), local->compound_mapcount - 1); + atomic_set(subpages_mapcount_ptr(head), local->subpages_mapcount); + bit_spin_unlock(PG_locked, &head[1].flags); +} + +/* + * When acting on a compound page under lock_compound_mapcounts(), avoid the + * unnecessary overhead of an actual atomic operation on its subpage mapcount. + * Return true if this is the first increment or the last decrement + * (remembering that page->_mapcount -1 represents logical mapcount 0). + */ +static bool subpage_mapcount_inc(struct page *page) +{ + int orig_mapcount = atomic_read(&page->_mapcount); + + atomic_set(&page->_mapcount, orig_mapcount + 1); + return orig_mapcount < 0; +} + +static bool subpage_mapcount_dec(struct page *page) +{ + int orig_mapcount = atomic_read(&page->_mapcount); + + atomic_set(&page->_mapcount, orig_mapcount - 1); + return orig_mapcount == 0; +} + /* * When mapping a THP's first pmd, or unmapping its last pmd, if that THP * also has pte mappings, then those must be discounted: in order to maintain * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, * and to decide when an anon THP should be put on the deferred split queue. + * This function must be called between lock_ and unlock_compound_mapcounts(). 
*/ static int nr_subpages_unmapped(struct page *head, int nr_subpages) { @@ -1103,6 +1158,40 @@ static int nr_subpages_unmapped(struct page *head, int nr_subpages) return nr; } +/* + * page_dup_compound_rmap(), used when copying mm, or when splitting pmd, + * provides a simple example of using lock_ and unlock_compound_mapcounts(). + */ +void page_dup_compound_rmap(struct page *page, bool compound) +{ + struct compound_mapcounts mapcounts; + struct page *head; + + /* + * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; + * but at present they are still being managed by atomic operations: + * which are likely to be somewhat faster, so don't rush to convert + * them over without evaluating the effect. + * + * Note that hugetlb does not call page_add_file_rmap(): + * here is where hugetlb shared page mapcount is raised. + */ + if (PageHuge(page)) { + atomic_inc(compound_mapcount_ptr(page)); + return; + } + + head = compound_head(page); + lock_compound_mapcounts(head, &mapcounts); + if (compound) { + mapcounts.compound_mapcount++; + } else { + mapcounts.subpages_mapcount++; + subpage_mapcount_inc(page); + } + unlock_compound_mapcounts(head, &mapcounts); +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1212,7 +1301,8 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - int nr, nr_pages; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first; @@ -1222,33 +1312,37 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound && PageTransHuge(page)) { - atomic_t *mapcount; - VM_BUG_ON_PAGE(!PageLocked(page), page); - mapcount = compound_mapcount_ptr(page); - first = atomic_inc_and_test(mapcount); + lock_compound_mapcounts(page, &mapcounts); + first = !mapcounts.compound_mapcount; + mapcounts.compound_mapcount++; + if (first) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); + } + unlock_compound_mapcounts(page, &mapcounts); - nr = nr_pages = thp_nr_pages(page); - if (first && head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); - } else { - nr = 1; - if (PageTransCompound(page)) { - struct page *head = compound_head(page); + } else if (PageCompound(page)) { + struct page *head = compound_head(page); - atomic_inc(subpages_mapcount_ptr(head)); - nr = !head_compound_mapcount(head); - } + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + first = subpage_mapcount_inc(page); + nr = first && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + + } else { first = atomic_inc_and_test(&page->_mapcount); + nr = first; } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) { - if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pages); + if (nr_pmdmapped) + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - } if (unlikely(PageKsm(page))) unlock_page_memcg(page); @@ -1308,39 +1402,41 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int nr = 0; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; + bool first; 
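The lock/copy/modify/write-back pattern used by lock_compound_mapcounts() and unlock_compound_mapcounts() can be sketched in plain C11 as follows (a userspace model under stated assumptions: an atomic_flag stands in for bit_spin_lock(PG_locked, &head[1].flags), and the two raw_* globals stand in for the head[1] counters):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct compound_mapcounts {
        unsigned int compound_mapcount;
        unsigned int subpages_mapcount;
};

/* Stand-ins for PG_locked in head[1].flags and the two head[1] counters */
static atomic_flag mapcount_lock = ATOMIC_FLAG_INIT;
static atomic_int raw_compound_mapcount = -1;  /* stored offset by -1 */
static atomic_int raw_subpages_mapcount;

static void model_lock_compound_mapcounts(struct compound_mapcounts *local)
{
        while (atomic_flag_test_and_set_explicit(&mapcount_lock,
                                                 memory_order_acquire))
                ;  /* spin, like bit_spin_lock() */
        local->compound_mapcount = atomic_load(&raw_compound_mapcount) + 1;
        local->subpages_mapcount = atomic_load(&raw_subpages_mapcount);
}

static void model_unlock_compound_mapcounts(const struct compound_mapcounts *local)
{
        atomic_store(&raw_compound_mapcount, (int)local->compound_mapcount - 1);
        atomic_store(&raw_subpages_mapcount, (int)local->subpages_mapcount);
        atomic_flag_clear_explicit(&mapcount_lock, memory_order_release);
}

/* Shape of the "add a PMD mapping" path: returns true for the first map */
static bool model_add_compound_mapping(void)
{
        struct compound_mapcounts m;
        bool first;

        model_lock_compound_mapcounts(&m);
        first = !m.compound_mapcount;
        m.compound_mapcount++;
        model_unlock_compound_mapcounts(&m);
        return first;  /* stats are then updated outside the lock */
}

int main(void)
{
        printf("first=%d then first=%d\n",
               model_add_compound_mapping(), model_add_compound_mapping());
        return 0;
}

Because all statistics updates happen only after the unlock, the bit lock stays a leaf lock, as the commit message above requires.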
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); + if (compound && PageTransHuge(page)) { - int nr_pages; + lock_compound_mapcounts(page, &mapcounts); + first = !mapcounts.compound_mapcount; + mapcounts.compound_mapcount++; + if (first) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); + } + unlock_compound_mapcounts(page, &mapcounts); - if (!atomic_inc_and_test(compound_mapcount_ptr(page))) - goto out; + } else if (PageCompound(page)) { + struct page *head = compound_head(page); - nr = nr_pages = thp_nr_pages(page); - if (head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + first = subpage_mapcount_inc(page); + nr = first && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - nr_pages); } else { - bool pmd_mapped = false; - - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_inc(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); - } - if (atomic_inc_and_test(&page->_mapcount) && !pmd_mapped) - nr++; + first = atomic_inc_and_test(&page->_mapcount); + nr = first; } -out: + + if (nr_pmdmapped) + __mod_lruvec_page_state(page, PageSwapBacked(page) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); @@ -1348,137 +1444,84 @@ out: mlock_vma_page(page, vma, compound); } -static void page_remove_file_rmap(struct page *page, bool compound) +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed + * @compound: uncharge the page as compound or small page + * + * The caller needs to hold the pte lock. + */ +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { - int nr = 0; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; + bool last; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + /* Hugetlb pages are not counted in NR_*MAPPED */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); return; } + lock_page_memcg(page); + /* page still mapped by someone else? 
*/ if (compound && PageTransHuge(page)) { - int nr_pages; + lock_compound_mapcounts(page, &mapcounts); + mapcounts.compound_mapcount--; + last = !mapcounts.compound_mapcount; + if (last) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); + } + unlock_compound_mapcounts(page, &mapcounts); - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; + } else if (PageCompound(page)) { + struct page *head = compound_head(page); - nr = nr_pages = thp_nr_pages(page); - if (head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount--; + last = subpage_mapcount_dec(page); + nr = last && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - -nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - -nr_pages); } else { - bool pmd_mapped = false; - - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_dec(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); - } - if (atomic_add_negative(-1, &page->_mapcount) && !pmd_mapped) - nr++; + last = atomic_add_negative(-1, &page->_mapcount); + nr = last; } - if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); -} - -static void page_remove_anon_compound_rmap(struct page *page) -{ - int nr, nr_pages; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; - - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - return; - - nr = nr_pages = thp_nr_pages(page); - __mod_lruvec_page_state(page, NR_ANON_THPS, -nr); - - if (head_subpages_mapcount(page)) { - nr = nr_subpages_unmapped(page, nr_pages); - + if (nr_pmdmapped) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : + (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : + NR_FILE_PMDMAPPED), -nr_pmdmapped); + } + if (nr) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : + NR_FILE_MAPPED, -nr); /* - * Queue the page for deferred split if at least one small + * Queue anon THP for deferred split if at least one small * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < nr_pages) - deferred_split_huge_page(page); - } - - if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); -} - -/** - * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from - * @vma: the vm area from which the mapping is removed - * @compound: uncharge the page as compound or small page - * - * The caller needs to hold the pte lock. - */ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - bool pmd_mapped = false; - - lock_page_memcg(page); - - if (!PageAnon(page)) { - page_remove_file_rmap(page, compound); - goto out; + if (PageTransCompound(page) && PageAnon(page)) + if (!compound || nr < nr_pmdmapped) + deferred_split_huge_page(compound_head(page)); } - if (compound) { - page_remove_anon_compound_rmap(page); - goto out; - } - - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_dec(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); - } - - /* page still mapped by someone else? 
*/ - if (!atomic_add_negative(-1, &page->_mapcount) || pmd_mapped) - goto out; - - __dec_lruvec_page_state(page, NR_ANON_MAPPED); - - if (PageTransCompound(page)) - deferred_split_huge_page(compound_head(page)); - /* - * It would be tidy to reset the PageAnon mapping here, + * It would be tidy to reset PageAnon mapping when fully unmapped, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_unref_page, + * before us: so leave the reset to free_pages_prepare, * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. */ -out: + unlock_page_memcg(page); munlock_vma_page(page, vma, compound); -- cgit v1.2.3 From be5ef2d9b006bbd93b1a03e1da2dbd19fb0b9f14 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:42:04 -0800 Subject: mm,thp,rmap: subpages_mapcount of PTE-mapped subpages Patch series "mm,thp,rmap: rework the use of subpages_mapcount", v2. This patch (of 3): Following suggestion from Linus, instead of counting every PTE map of a compound page in subpages_mapcount, just count how many of its subpages are PTE-mapped: this yields the exact number needed for NR_ANON_MAPPED and NR_FILE_MAPPED stats, without any need for a locked scan of subpages; and requires updating the count less often. This does then revert total_mapcount() and folio_mapcount() to needing a scan of subpages; but they are inherently racy, and need no locking, so Linus is right that the scans are much better done there. Plus (unlike in 6.1 and previous) subpages_mapcount lets us avoid the scan in the common case of no PTE maps. And page_mapped() and folio_mapped() remain scanless and just as efficient with the new meaning of subpages_mapcount: those are the functions which I most wanted to remove the scan from. The updated page_dup_compound_rmap() is no longer suitable for use by anon THP's __split_huge_pmd_locked(); but page_add_anon_rmap() can be used for that, so long as its VM_BUG_ON_PAGE(!PageLocked) is deleted. Evidence is that this way goes slightly faster than the previous implementation for most cases; but significantly faster in the (now scanless) pmds after ptes case, which started out at 870ms and was brought down to 495ms by the previous series, now takes around 105ms. Link: https://lkml.kernel.org/r/a5849eca-22f1-3517-bf29-95d982242742@google.com Link: https://lkml.kernel.org/r/eec17e16-4e1-7c59-f1bc-5bca90dac919@google.com Signed-off-by: Hugh Dickins Suggested-by: Linus Torvalds Acked-by: Kirill A. 
Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 3 +- include/linux/mm.h | 52 ++++++++------ include/linux/rmap.h | 9 +-- mm/huge_memory.c | 2 +- mm/rmap.c | 160 ++++++++++++++++++----------------------- 5 files changed, 107 insertions(+), 119 deletions(-) (limited to 'include') diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 1e2a637cc607..af4c9d70321d 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -122,7 +122,8 @@ pages: - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount on relevant sub-page of the compound page, and also increment/decrement - ->subpages_mapcount, stored in first tail page of the compound page. + ->subpages_mapcount, stored in first tail page of the compound page, when + _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. In order to have race-free accounting of sub-pages mapped, changes to sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are are all locked by bit_spin_lock of PG_locked in the first tail ->flags. diff --git a/include/linux/mm.h b/include/linux/mm.h index a904c2d60f12..84fb91f6f56e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -837,7 +837,7 @@ static inline int head_compound_mapcount(struct page *head) } /* - * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) @@ -873,23 +873,7 @@ static inline int page_mapcount(struct page *page) return head_compound_mapcount(page) + mapcount; } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - page = compound_head(page); - return head_compound_mapcount(page) + head_subpages_mapcount(page); -} - -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped, - * even if this particular subpage is not itself mapped by any PTE or PMD. - */ -static inline bool page_mapped(struct page *page) -{ - return total_mapcount(page) > 0; -} +int total_compound_mapcount(struct page *head); /** * folio_mapcount() - Calculate the number of mappings of this folio. 
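With subpages_mapcount now counting how many subpages are PTE-mapped, total_compound_mapcount() only has to scan when that count is non-zero. A hedged userspace sketch of that arithmetic (the model_* names are illustrative, not the kernel API):

#include <stdio.h>

/*
 * subpage_mapcount[] uses the -1-offset convention; compound_mapcount and
 * subpages_mapcount are passed as their logical values.
 */
static int model_total_compound_mapcount(const int *subpage_mapcount,
                                         int nr_subpages,
                                         int compound_mapcount,
                                         int subpages_mapcount)
{
        int mapcount = compound_mapcount;

        /* Common case: no subpage is PTE-mapped, so no scan is needed */
        if (subpages_mapcount == 0)
                return mapcount;

        /* Otherwise add every subpage's own PTE mapcount (each offset by -1) */
        for (int i = 0; i < nr_subpages; i++)
                mapcount += subpage_mapcount[i] + 1;
        return mapcount;
}

int main(void)
{
        int subpage_mapcount[8] = { -1, 1, -1, 0, -1, -1, -1, -1 };

        /* One PMD mapping, two PTE-mapped subpages (with 2 and 1 maps each) */
        printf("total mapcount = %d\n",
               model_total_compound_mapcount(subpage_mapcount, 8, 1, 2));
        return 0;
}

The scan remains racy, as the commit message says, but it is confined to total_mapcount() and folio_mapcount(); page_mapped() and folio_mapped() stay scanless.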
@@ -906,8 +890,20 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return atomic_read(folio_mapcount_ptr(folio)) + 1 + - atomic_read(folio_subpages_mapcount_ptr(folio)); + return total_compound_mapcount(&folio->page); +} + +static inline int total_mapcount(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + return total_compound_mapcount(compound_head(page)); +} + +static inline bool folio_large_is_mapped(struct folio *folio) +{ + return atomic_read(folio_mapcount_ptr(folio)) + + atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; } /** @@ -918,7 +914,21 @@ static inline int folio_mapcount(struct folio *folio) */ static inline bool folio_mapped(struct folio *folio) { - return folio_mapcount(folio) > 0; + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) >= 0; + return folio_large_is_mapped(folio); +} + +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + return folio_large_is_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 011a7530dc76..5dadb9a3e010 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,14 +204,15 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page, bool compound); +void page_dup_compound_rmap(struct page *page); static inline void page_dup_file_rmap(struct page *page, bool compound) { - if (PageCompound(page)) - page_dup_compound_rmap(page, compound); - else + /* Is page being mapped by PTE? */ + if (likely(!compound)) atomic_inc(&page->_mapcount); + else + page_dup_compound_rmap(page); } /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 114517e8cbfc..681253adf529 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2221,7 +2221,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); if (!pmd_migration) - page_dup_compound_rmap(page + i, false); + page_add_anon_rmap(page + i, vma, addr, false); pte_unmap(pte); } diff --git a/mm/rmap.c b/mm/rmap.c index 4833d28c5e1a..e813785da613 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1117,55 +1117,36 @@ static void unlock_compound_mapcounts(struct page *head, bit_spin_unlock(PG_locked, &head[1].flags); } -/* - * When acting on a compound page under lock_compound_mapcounts(), avoid the - * unnecessary overhead of an actual atomic operation on its subpage mapcount. - * Return true if this is the first increment or the last decrement - * (remembering that page->_mapcount -1 represents logical mapcount 0). 
- */ -static bool subpage_mapcount_inc(struct page *page) -{ - int orig_mapcount = atomic_read(&page->_mapcount); - - atomic_set(&page->_mapcount, orig_mapcount + 1); - return orig_mapcount < 0; -} - -static bool subpage_mapcount_dec(struct page *page) -{ - int orig_mapcount = atomic_read(&page->_mapcount); - - atomic_set(&page->_mapcount, orig_mapcount - 1); - return orig_mapcount == 0; -} - -/* - * When mapping a THP's first pmd, or unmapping its last pmd, if that THP - * also has pte mappings, then those must be discounted: in order to maintain - * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, - * and to decide when an anon THP should be put on the deferred split queue. - * This function must be called between lock_ and unlock_compound_mapcounts(). - */ -static int nr_subpages_unmapped(struct page *head, int nr_subpages) +int total_compound_mapcount(struct page *head) { - int nr = nr_subpages; + int mapcount = head_compound_mapcount(head); + int nr_subpages; int i; - /* Discount those subpages mapped by pte */ + /* In the common case, avoid the loop when no subpages mapped by PTE */ + if (head_subpages_mapcount(head) == 0) + return mapcount; + /* + * Add all the PTE mappings of those subpages mapped by PTE. + * Limit the loop, knowing that only subpages_mapcount are mapped? + * Perhaps: given all the raciness, that may be a good or a bad idea. + */ + nr_subpages = thp_nr_pages(head); for (i = 0; i < nr_subpages; i++) - if (atomic_read(&head[i]._mapcount) >= 0) - nr--; - return nr; + mapcount += atomic_read(&head[i]._mapcount); + + /* But each of those _mapcounts was based on -1 */ + mapcount += nr_subpages; + return mapcount; } /* - * page_dup_compound_rmap(), used when copying mm, or when splitting pmd, + * page_dup_compound_rmap(), used when copying mm, * provides a simple example of using lock_ and unlock_compound_mapcounts(). */ -void page_dup_compound_rmap(struct page *page, bool compound) +void page_dup_compound_rmap(struct page *head) { struct compound_mapcounts mapcounts; - struct page *head; /* * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; @@ -1176,20 +1157,15 @@ void page_dup_compound_rmap(struct page *page, bool compound) * Note that hugetlb does not call page_add_file_rmap(): * here is where hugetlb shared page mapcount is raised. */ - if (PageHuge(page)) { - atomic_inc(compound_mapcount_ptr(page)); - return; - } + if (PageHuge(head)) { + atomic_inc(compound_mapcount_ptr(head)); + } else if (PageTransHuge(head)) { + /* That test is redundant: it's for safety or to optimize out */ - head = compound_head(page); - lock_compound_mapcounts(head, &mapcounts); - if (compound) { + lock_compound_mapcounts(head, &mapcounts); mapcounts.compound_mapcount++; - } else { - mapcounts.subpages_mapcount++; - subpage_mapcount_inc(page); + unlock_compound_mapcounts(head, &mapcounts); } - unlock_compound_mapcounts(head, &mapcounts); } /** @@ -1304,35 +1280,34 @@ void page_add_anon_rmap(struct page *page, struct compound_mapcounts mapcounts; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first; + bool first = true; if (unlikely(PageKsm(page))) lock_page_memcg(page); - else - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (likely(!PageCompound(page))) { + /* Is page being mapped by PTE? Is this its first map to be added? 
*/ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; + if (first && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; if (first) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - first = subpage_mapcount_inc(page); - nr = first && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); @@ -1411,28 +1386,29 @@ void page_add_file_rmap(struct page *page, VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (likely(!PageCompound(page))) { + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; + if (first && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; if (first) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - first = subpage_mapcount_inc(page); - nr = first && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } if (nr_pmdmapped) @@ -1471,29 +1447,29 @@ void page_remove_rmap(struct page *page, lock_page_memcg(page); - /* page still mapped by someone else? */ - if (likely(!PageCompound(page))) { + /* Is page being unmapped by PTE? Is this its last map to be removed? 
*/ + if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; + if (last && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount--; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); mapcounts.compound_mapcount--; last = !mapcounts.compound_mapcount; if (last) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount--; - last = subpage_mapcount_dec(page); - nr = last && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } if (nr_pmdmapped) { -- cgit v1.2.3 From 4b51634cd16a01b2be0f6b69cc0dae63de4751f2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:49:36 -0800 Subject: mm,thp,rmap: subpages_mapcount COMPOUND_MAPPED if PMD-mapped Can the lock_compound_mapcount() bit_spin_lock apparatus be removed now? Yes. Not by atomic64_t or cmpxchg games, those get difficult on 32-bit; but if we slightly abuse subpages_mapcount by additionally demanding that one bit be set there when the compound page is PMD-mapped, then a cascade of two atomic ops is able to maintain the stats without bit_spin_lock. This is harder to reason about than when bit_spin_locked, but I believe safe; and no drift in stats detected when testing. When there are racing removes and adds, of course the sequence of operations is less well- defined; but each operation on subpages_mapcount is atomically good. What might be disastrous, is if subpages_mapcount could ever fleetingly appear negative: but the pte lock (or pmd lock) these rmap functions are called under, ensures that a last remove cannot race ahead of a first add. Continue to make an exception for hugetlb (PageHuge) pages, though that exception can be easily removed by a further commit if necessary: leave subpages_mapcount 0, don't bother with COMPOUND_MAPPED in its case, just carry on checking compound_mapcount too in folio_mapped(), page_mapped(). Evidence is that this way goes slightly faster than the previous implementation in all cases (pmds after ptes now taking around 103ms); and relieves us of worrying about contention on the bit_spin_lock. Link: https://lkml.kernel.org/r/3978f3ca-5473-55a7-4e14-efea5968d892@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. 
Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 7 +-- include/linux/mm.h | 19 ++++++- include/linux/rmap.h | 13 ++--- mm/page_alloc.c | 2 +- mm/rmap.c | 121 +++++++++-------------------------------- 5 files changed, 51 insertions(+), 111 deletions(-) (limited to 'include') diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index af4c9d70321d..ec3dc5b04226 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -118,15 +118,14 @@ pages: succeeds on tail pages. - map/unmap of PMD entry for the whole compound page increment/decrement - ->compound_mapcount, stored in the first tail page of the compound page. + ->compound_mapcount, stored in the first tail page of the compound page; + and also increment/decrement ->subpages_mapcount (also in the first tail) + by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1. - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount on relevant sub-page of the compound page, and also increment/decrement ->subpages_mapcount, stored in first tail page of the compound page, when _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. - In order to have race-free accounting of sub-pages mapped, changes to - sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are - are all locked by bit_spin_lock of PG_locked in the first tail ->flags. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page diff --git a/include/linux/mm.h b/include/linux/mm.h index 84fb91f6f56e..d33639be3db3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -836,13 +836,22 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } +/* + * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, + * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves subpages_mapcount at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) + /* * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) { - return atomic_read(subpages_mapcount_ptr(head)); + return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; } /* @@ -902,8 +911,12 @@ static inline int total_mapcount(struct page *page) static inline bool folio_large_is_mapped(struct folio *folio) { - return atomic_read(folio_mapcount_ptr(folio)) + - atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; + /* + * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * participated in incrementing subpages_mapcount when compound mapped. 
+ */ + return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + atomic_read(folio_mapcount_ptr(folio)) >= 0; } /** diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5dadb9a3e010..bd3504d11b15 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,15 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page); +static inline void __page_dup_rmap(struct page *page, bool compound) +{ + atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); +} static inline void page_dup_file_rmap(struct page *page, bool compound) { - /* Is page being mapped by PTE? */ - if (likely(!compound)) - atomic_inc(&page->_mapcount); - else - page_dup_compound_rmap(page); + __page_dup_rmap(page, compound); } /** @@ -261,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. */ dup: - page_dup_file_rmap(page, compound); + __page_dup_rmap(page, compound); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0705917ddf54..c33b6963c2d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1330,7 +1330,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero compound_mapcount"); goto out; } - if (unlikely(head_subpages_mapcount(head_page))) { + if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { bad_page(page, "nonzero subpages_mapcount"); goto out; } diff --git a/mm/rmap.c b/mm/rmap.c index e813785da613..459dc1c44d8a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,38 +1085,6 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -struct compound_mapcounts { - unsigned int compound_mapcount; - unsigned int subpages_mapcount; -}; - -/* - * lock_compound_mapcounts() first locks, then copies subpages_mapcount and - * compound_mapcount from head[1].compound_mapcount and subpages_mapcount, - * converting from struct page's internal representation to logical count - * (that is, adding 1 to compound_mapcount to hide its offset by -1). - */ -static void lock_compound_mapcounts(struct page *head, - struct compound_mapcounts *local) -{ - bit_spin_lock(PG_locked, &head[1].flags); - local->compound_mapcount = atomic_read(compound_mapcount_ptr(head)) + 1; - local->subpages_mapcount = atomic_read(subpages_mapcount_ptr(head)); -} - -/* - * After caller has updated subpage._mapcount, local subpages_mapcount and - * local compound_mapcount, as necessary, unlock_compound_mapcounts() converts - * and copies them back to the compound head[1] fields, and then unlocks. - */ -static void unlock_compound_mapcounts(struct page *head, - struct compound_mapcounts *local) -{ - atomic_set(compound_mapcount_ptr(head), local->compound_mapcount - 1); - atomic_set(subpages_mapcount_ptr(head), local->subpages_mapcount); - bit_spin_unlock(PG_locked, &head[1].flags); -} - int total_compound_mapcount(struct page *head) { int mapcount = head_compound_mapcount(head); @@ -1140,34 +1108,6 @@ int total_compound_mapcount(struct page *head) return mapcount; } -/* - * page_dup_compound_rmap(), used when copying mm, - * provides a simple example of using lock_ and unlock_compound_mapcounts(). 
- */ -void page_dup_compound_rmap(struct page *head) -{ - struct compound_mapcounts mapcounts; - - /* - * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; - * but at present they are still being managed by atomic operations: - * which are likely to be somewhat faster, so don't rush to convert - * them over without evaluating the effect. - * - * Note that hugetlb does not call page_add_file_rmap(): - * here is where hugetlb shared page mapcount is raised. - */ - if (PageHuge(head)) { - atomic_inc(compound_mapcount_ptr(head)); - } else if (PageTransHuge(head)) { - /* That test is redundant: it's for safety or to optimize out */ - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.compound_mapcount++; - unlock_compound_mapcounts(head, &mapcounts); - } -} - /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1277,7 +1217,7 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first = true; @@ -1290,24 +1230,20 @@ void page_add_anon_rmap(struct page *page, first = atomic_inc_and_test(&page->_mapcount); nr = first; if (first && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - lock_compound_mapcounts(page, &mapcounts); - first = !mapcounts.compound_mapcount; - mapcounts.compound_mapcount++; + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); @@ -1360,6 +1296,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); nr = thp_nr_pages(page); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } @@ -1379,7 +1316,7 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool first; @@ -1391,24 +1328,20 @@ void page_add_file_rmap(struct page *page, first = atomic_inc_and_test(&page->_mapcount); nr = first; if (first && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - 
lock_compound_mapcounts(page, &mapcounts); - first = !mapcounts.compound_mapcount; - mapcounts.compound_mapcount++; + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } if (nr_pmdmapped) @@ -1432,7 +1365,7 @@ void page_add_file_rmap(struct page *page, void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool last; @@ -1452,24 +1385,20 @@ void page_remove_rmap(struct page *page, last = atomic_add_negative(-1, &page->_mapcount); nr = last; if (last && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount--; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_dec_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - lock_compound_mapcounts(page, &mapcounts); - mapcounts.compound_mapcount--; - last = !mapcounts.compound_mapcount; + last = atomic_add_negative(-1, compound_mapcount_ptr(page)); if (last) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } if (nr_pmdmapped) { -- cgit v1.2.3 From eb309ec89953d6a3e8e35a3a577bab13893858d8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:49 +0100 Subject: mm/mprotect: factor out check whether manual PTE write upgrades are required Let's factor the check out into vma_wants_manual_pte_write_upgrade(), to be reused in NUMA hinting fault context soon. Link: https://lkml.kernel.org/r/20221108174652.198904-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- mm/mprotect.c | 17 ++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d33639be3db3..e203e8a83e2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2088,6 +2088,20 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. 
+ */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); + +} extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, @@ -2227,8 +2241,6 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/mm/mprotect.c b/mm/mprotect.c index 72aabffb7871..fe22db2c9cdd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -558,8 +558,8 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; + unsigned int mm_cp_flags = 0; unsigned long charged = 0; - bool try_change_writable; pgoff_t pgoff; int error; @@ -637,20 +637,11 @@ success: * held in write mode. */ vma->vm_flags = newflags; - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); - else - try_change_writable = !!(vma->vm_flags & VM_WRITE); + if (vma_wants_manual_pte_write_upgrade(vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, - try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); + change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major -- cgit v1.2.3 From 6a56ccbcf6c69538b152644107a1d7383c876ca7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:50 +0100 Subject: mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite commit b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting fault") added remembering write permissions using ordinary pte_write() for PROT_NONE mapped pages to avoid write faults when remapping the page !PROT_NONE on NUMA hinting faults. That commit noted: The patch looks hacky but the alternatives looked worse. The tidest was to rewalk the page tables after a hinting fault but it was more complex than this approach and the performance was worse. It's not generally safe to just mark the page writable during the fault if it's a write fault as it may have been read-only for COW so that approach was discarded. Later, commit 288bc54949fc ("mm/autonuma: let architecture override how the write bit should be stashed in a protnone pte.") introduced a family of savedwrite PTE functions that didn't necessarily improve the whole situation. One confusing thing is that nowadays, if a page is pte_protnone() and pte_savedwrite() then also pte_write() is true. Another source of confusion is that there is only a single pte_mk_savedwrite() call in the kernel. All other write-protection code seems to silently rely on pte_wrprotect(). 
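To make the first confusion concrete, ppc64 (book3s/64) currently ties the
predicates together as in the condensed sketch below (these are the very
definitions that the follow-up "remove unused savedwrite infrastructure"
patch further down deletes):

	static inline int pte_write(pte_t pte)
	{
		/* a protnone pte with savedwrite set still reports itself writable */
		return __pte_write(pte) || pte_savedwrite(pte);
	}

So pte_protnone() && pte_savedwrite() does indeed imply pte_write().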
Ever since PageAnonExclusive was introduced and we started using it in mprotect context via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection"), we do have machinery in place to avoid write faults when changing protection, which is exactly what we want to do here. Let's similarly do what ordinary mprotect() does nowadays when upgrading write permissions and reuse can_change_pte_writable() and can_change_pmd_writable() to detect if we can upgrade PTE permissions to be writable. For anonymous pages there should be absolutely no change: if an anonymous page is not exclusive, it could not have been mapped writable -- because only exclusive anonymous pages can be mapped writable. However, there *might* be a change for writable shared mappings that require writenotify: if they are not dirty, we cannot map them writable. While it might not matter in practice, we'd need a different way to identify whether writenotify is actually required -- and ordinary mprotect would benefit from that as well. Note that we don't optimize for the actual migration case: (1) When migration succeeds the new PTE will not be writable because the source PTE was not writable (protnone); in the future we might just optimize that case similarly by reusing can_change_pte_writable()/can_change_pmd_writable() when removing migration PTEs. (2) When migration fails, we'd have to recalculate the "writable" flag because we temporarily dropped the PT lock; for now keep it simple and set "writable=false". We'll remove all savedwrite leftovers next. Link: https://lkml.kernel.org/r/20221108174652.198904-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/huge_memory.c | 26 +++++++++++++++----------- mm/ksm.c | 9 ++++----- mm/memory.c | 16 +++++++++++++--- mm/mprotect.c | 7 ++----- 5 files changed, 36 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e203e8a83e2d..8597ef676fc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2102,6 +2102,8 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma return !!(vma->vm_flags & VM_WRITE); } +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fac917b78102..29102e3ddf84 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1511,8 +1511,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); - bool migrated = false; - bool was_writable = pmd_savedwrite(oldpmd); + bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1522,12 +1521,22 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } pmd = pmd_modify(oldpmd, vma->vm_page_prot); + + /* + * Detect now whether the PMD could be writable; this information + * is only valid while holding the PT lock. 
+ */ + writable = pmd_write(pmd); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pmd_writable(vma, vmf->address, pmd)) + writable = true; + page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) goto out_map; /* See similar comment in do_numa_page for explanation */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); @@ -1546,6 +1555,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } spin_unlock(vmf->ptl); + writable = false; migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { @@ -1572,7 +1582,7 @@ out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); - if (was_writable) + if (writable) pmd = pmd_mkwrite(pmd); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -1813,11 +1823,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; - bool preserve_write; - int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -1828,9 +1837,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -1910,8 +1916,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); if (uffd_wp) { entry = pmd_wrprotect(entry); entry = pmd_mkuffd_wp(entry); diff --git a/mm/ksm.c b/mm/ksm.c index 7ba97f86d831..a71245241d22 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1041,7 +1041,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; @@ -1079,11 +1078,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_dirty(entry)) set_page_dirty(page); + entry = pte_mkclean(entry); + + if (pte_write(entry)) + entry = pte_wrprotect(entry); - if (pte_protnone(entry)) - entry = pte_mkclean(pte_clear_savedwrite(entry)); - else - entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; diff --git a/mm/memory.c b/mm/memory.c index 142c4229549b..1749c638734f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4675,10 +4675,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = NUMA_NO_NODE; + bool writable = false; int last_cpupid; int target_nid; pte_t pte, old_pte; - bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* @@ -4697,6 +4697,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); + /* + * Detect now whether the PTE could be writable; this information + * is only valid while holding the PT lock. 
+ */ + writable = pte_write(pte); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pte_writable(vma, vmf->address, pte)) + writable = true; + page = vm_normal_page(vma, vmf->address, pte); if (!page || is_zone_device_page(page)) goto out_map; @@ -4713,7 +4722,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; /* @@ -4740,6 +4749,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); + writable = false; /* Migrate to the requested node */ if (migrate_misplaced_page(page, vma, target_nid)) { @@ -4768,7 +4778,7 @@ out_map: old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); - if (was_writable) + if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index fe22db2c9cdd..093cb50f2fc4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -39,8 +39,8 @@ #include "internal.h" -static inline bool can_change_pte_writable(struct vm_area_struct *vma, - unsigned long addr, pte_t pte) +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { struct page *page; @@ -121,7 +121,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -177,8 +176,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); if (uffd_wp) { ptent = pte_wrprotect(ptent); -- cgit v1.2.3 From d6379159f47630813f06f97535cc82ce7b9eed49 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:51 +0100 Subject: mm: remove unused savedwrite infrastructure NUMA hinting no longer uses savedwrite, let's rip it out. ... and while at it, drop __pte_write() and __pmd_write() on ppc64. 
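For illustration, the ppc64 write helpers collapse to plain _PAGE_WRITE tests
once savedwrite is gone (condensed sketch taken from the hunks below):

	static inline int pte_write(pte_t pte)
	{
		return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE));
	}

	static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
					      pte_t *ptep)
	{
		if (pte_write(*ptep))
			pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0);
	}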
Link: https://lkml.kernel.org/r/20221108174652.198904-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 80 ++-------------------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- include/linux/pgtable.h | 24 --------- mm/debug_vm_pgtable.c | 32 ----------- 4 files changed, 5 insertions(+), 133 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c436d8422654..cb4c67bf45d7 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -401,35 +401,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #define pmdp_clear_flush_young pmdp_test_and_clear_young -static inline int __pte_write(pte_t pte) -{ - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE)); -} - -#ifdef CONFIG_NUMA_BALANCING -#define pte_savedwrite pte_savedwrite -static inline bool pte_savedwrite(pte_t pte) -{ - /* - * Saved write ptes are prot none ptes that doesn't have - * privileged bit sit. We mark prot none as one which has - * present and pviliged bit set and RWX cleared. To mark - * protnone which used to have _PAGE_WRITE set we clear - * the privileged bit. - */ - return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED)); -} -#else -#define pte_savedwrite pte_savedwrite -static inline bool pte_savedwrite(pte_t pte) -{ - return false; -} -#endif - static inline int pte_write(pte_t pte) { - return __pte_write(pte) || pte_savedwrite(pte); + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE)); } static inline int pte_read(pte_t pte) @@ -441,24 +415,16 @@ static inline int pte_read(pte_t pte) static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (__pte_write(*ptep)) + if (pte_write(*ptep)) pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); - else if (unlikely(pte_savedwrite(*ptep))) - pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - /* - * We should not find protnone for hugetlb, but this complete the - * interface. - */ - if (__pte_write(*ptep)) + if (pte_write(*ptep)) pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); - else if (unlikely(pte_savedwrite(*ptep))) - pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1); } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR @@ -535,36 +501,6 @@ static inline int pte_protnone(pte_t pte) return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE | _PAGE_RWX)) == cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE); } - -#define pte_mk_savedwrite pte_mk_savedwrite -static inline pte_t pte_mk_savedwrite(pte_t pte) -{ - /* - * Used by Autonuma subsystem to preserve the write bit - * while marking the pte PROT_NONE. 
Only allow this - * on PROT_NONE pte - */ - VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) != - cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED)); - return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); -} - -#define pte_clear_savedwrite pte_clear_savedwrite -static inline pte_t pte_clear_savedwrite(pte_t pte) -{ - /* - * Used by KSM subsystem to make a protnone pte readonly. - */ - VM_BUG_ON(!pte_protnone(pte)); - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); -} -#else -#define pte_clear_savedwrite pte_clear_savedwrite -static inline pte_t pte_clear_savedwrite(pte_t pte) -{ - VM_WARN_ON(1); - return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); -} #endif /* CONFIG_NUMA_BALANCING */ static inline bool pte_hw_valid(pte_t pte) @@ -641,8 +577,6 @@ static inline unsigned long pte_pfn(pte_t pte) /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { - if (unlikely(pte_savedwrite(pte))) - return pte_clear_savedwrite(pte); return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); } @@ -1139,8 +1073,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) -#define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd))) -#define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd))) #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd)) @@ -1162,8 +1094,6 @@ static inline int pmd_protnone(pmd_t pmd) #endif /* CONFIG_NUMA_BALANCING */ #define pmd_write(pmd) pte_write(pmd_pte(pmd)) -#define __pmd_write(pmd) __pte_write(pmd_pte(pmd)) -#define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd)) #define pmd_access_permitted pmd_access_permitted static inline bool pmd_access_permitted(pmd_t pmd, bool write) @@ -1241,10 +1171,8 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (__pmd_write((*pmdp))) + if (pmd_write(*pmdp)) pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); - else if (unlikely(pmd_savedwrite(*pmdp))) - pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED); } /* diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5a05953ae13f..9182324dbef9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -265,7 +265,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte = kvmppc_read_update_linux_pte(ptep, writing); if (pte_present(pte) && !pte_protnone(pte)) { - if (writing && !__pte_write(pte)) + if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); is_ci = pte_ci(pte); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f0d7d0b9471..c74cce67eec8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -503,30 +503,6 @@ static inline pte_t pte_sw_mkyoung(pte_t pte) #define pte_sw_mkyoung pte_sw_mkyoung #endif -#ifndef pte_savedwrite -#define pte_savedwrite pte_write -#endif - -#ifndef pte_mk_savedwrite -#define pte_mk_savedwrite pte_mkwrite -#endif - -#ifndef pte_clear_savedwrite -#define pte_clear_savedwrite pte_wrprotect -#endif - -#ifndef pmd_savedwrite -#define pmd_savedwrite pmd_write -#endif - -#ifndef pmd_mk_savedwrite -#define pmd_mk_savedwrite pmd_mkwrite -#endif - 
-#ifndef pmd_clear_savedwrite -#define pmd_clear_savedwrite pmd_wrprotect -#endif - #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2b61fde8c38c..c631ade3f1d2 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,18 +171,6 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } -static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - pr_debug("Validating PTE saved write\n"); - WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); - WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -302,22 +290,6 @@ static void __init pmd_leaf_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); - WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); - WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); -} - #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -451,7 +423,6 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1288,9 +1259,6 @@ static int __init debug_vm_pgtable(void) pmd_leaf_tests(&args); pud_leaf_tests(&args); - pte_savedwrite_tests(&args); - pmd_savedwrite_tests(&args); - pte_special_tests(&args); pte_protnone_tests(&args); pmd_protnone_tests(&args); -- cgit v1.2.3 From 70fb4fdff5826a48886152fd5c5db04eb6c59a40 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:48 -0800 Subject: mm: introduce 'encoded' page pointers with embedded extra bits We already have this notion in parts of the MM code (see the mlock code with the LRU_PAGE and NEW_PAGE bits), but I'm going to introduce a new case, and I refuse to do the same thing we've done before where we just put bits in the raw pointer and say it's still a normal pointer. So this introduces a 'struct encoded_page' pointer that cannot be used for anything else than to encode a real page pointer and a couple of extra bits in the low bits. That way the compiler can trivially track the state of the pointer and you just explicitly encode and decode the extra bits. Note that this makes the alignment of 'struct page' explicit even for the case where CONFIG_HAVE_ALIGNED_STRUCT_PAGE is not set. That is entirely redundant in almost all cases, since the page structure already contains several word-sized entries. 
However, on m68k, the alignment of even 32-bit data is just 16 bits, and as such in theory the alignment of 'struct page' could be too. So let's just make it very very explicit that the alignment needs to be at least 32 bits, giving us a guarantee of two unused low bits in the pointer. Now, in practice, our page struct array is aligned much more than that anyway, even on m68k, and our existing code in mm/mlock.c obviously already depended on that. But since the whole point of this change is to be careful about the type system when hiding extra bits in the pointer, let's also be explicit about the assumptions we make. NOTE! This is being very careful in another way too: it has a build-time assertion that the 'flags' added to the page pointer actually fit in the two bits. That means that this helper must be inlined, and can only be used in contexts where the compiler can statically determine that the value fits in the available bits. [akpm@linux-foundation.org: kerneldoc on a forward-declared struct confuses htmldocs] Link: https://lore.kernel.org/all/Y2tKixpO4RO6DgW5@tuxmaker.boeblingen.de.ibm.com/ Link: https://lkml.kernel.org/r/20221109203051.1835763-1-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Heiko Carstens [s390] Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44a1a699b5ad..6b0009e7d4ae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -68,7 +68,7 @@ struct mem_cgroup; #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else -#define _struct_page_alignment +#define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { @@ -251,6 +251,38 @@ struct page { #endif } _struct_page_alignment; +/* + * struct encoded_page - a nonexistent type marking this pointer + * + * An 'encoded_page' pointer is a pointer to a regular 'struct page', but + * with the low bits of the pointer indicating extra context-dependent + * information. Not super-common, but happens in mmu_gather and mlock + * handling, and this acts as a type system check on that use. + * + * We only really have two guaranteed bits in general, although you could + * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + * for more. + * + * Use the supplied helper functions to endcode/decode the pointer and bits. + */ +struct encoded_page; +#define ENCODE_PAGE_BITS 3ul +static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) +{ + BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + return (struct encoded_page *)(flags | (unsigned long)page); +} + +static inline unsigned long encoded_page_flags(struct encoded_page *page) +{ + return ENCODE_PAGE_BITS & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); +} + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. 
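As a quick usage sketch of the helpers added above (the caller and the flag
value here are made up purely for illustration; the real users are the mlock
and mmu_gather code):

	struct encoded_page *enc = encode_page(page, 1);   /* flag must fit in ENCODE_PAGE_BITS */
	unsigned long flags = encoded_page_flags(enc);     /* returns 1 */
	struct page *back = encoded_page_ptr(enc);         /* returns the original page pointer */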
-- cgit v1.2.3 From 449c796768c9a1c738d1fa8671fb01663380b8a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:49 -0800 Subject: mm: teach release_pages() to take an array of encoded page pointers too release_pages() already could take either an array of page pointers, or an array of folio pointers. Expand it to also accept an array of encoded page pointers, which is what both the existing mlock() use and the upcoming mmu_gather use of encoded page pointers wants. Note that release_pages() won't actually use, or react to, any extra encoded bits. Instead, this is very much a case of "I have walked the array of encoded pages and done everything the extra bits tell me to do, now release it all". Also, while the "either page or folio pointers" dual use was handled with a cast of the pointer in "release_folios()", this takes a slightly different approach and uses the "transparent union" attribute to describe the set of arguments to the function: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html which has been supported by gcc forever, but the kernel hasn't used before. That allows us to avoid using various wrappers with casts, and just use the same function regardless of use. Link: https://lkml.kernel.org/r/20221109203051.1835763-2-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++++++++-- mm/swap.c | 16 ++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8597ef676fc3..f873441303b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1245,7 +1245,24 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } -void release_pages(struct page **pages, int nr); +/** + * release_pages - release an array of pages or folios + * + * This just releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. + */ +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); + +void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. @@ -1261,7 +1278,7 @@ void release_pages(struct page **pages, int nr); */ static inline void folios_put(struct folio **folios, unsigned int nr) { - release_pages((struct page **)folios, nr); + release_pages(folios, nr); } static inline void put_page(struct page *page) diff --git a/mm/swap.c b/mm/swap.c index b9a6817e07ff..70e2063ef43a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -981,22 +981,30 @@ void lru_cache_disable(void) /** * release_pages - batched put_page() - * @pages: array of pages to release + * @arg: array of pages to release * @nr: number of pages * - * Decrement the reference count on all the pages in @pages. If it + * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. 
*/ -void release_pages(struct page **pages, int nr) +void release_pages(release_pages_arg arg, int nr) { int i; + struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct folio *folio = page_folio(pages[i]); + struct folio *folio; + + /* Turn any of the argument types into a folio */ + folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get -- cgit v1.2.3 From 7cc8f9c7146a5c2dad6e71653c4f69972e73df6b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:50 -0800 Subject: mm: mmu_gather: prepare to gather encoded page pointers with flags This is purely a preparatory patch that makes all the data structures ready for encoding flags with the mmu_gather page pointers. The code currently always sets the flag to zero and doesn't use it yet, but now it's tracking the type state along. The next step will be to actually start using it. Link: https://lkml.kernel.org/r/20221109203051.1835763-3-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 8 +++++--- include/asm-generic/tlb.h | 9 +++++---- include/linux/swap.h | 2 +- mm/mmu_gather.c | 8 ++++---- mm/swap_state.c | 11 ++++------- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3a5c8fb590e5..05142226d65d 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,7 +25,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size); + struct encoded_page *page, + int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -42,9 +43,10 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * has already been freed, so just do free_page_and_swap_cache. 
*/ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) + struct encoded_page *page, + int page_size) { - free_page_and_swap_cache(page); + free_page_and_swap_cache(encoded_page_ptr(page)); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index cab7cfebf40b..54d03d1e712e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -246,7 +246,7 @@ struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; - struct page *pages[]; + struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ @@ -260,7 +260,8 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct encoded_page *page, int page_size); #endif @@ -435,13 +436,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) + if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) tlb_flush_mmu(tlb); } static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - return __tlb_remove_page_size(tlb, page, PAGE_SIZE); + return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE); } /* tlb_remove_page diff --git a/include/linux/swap.h b/include/linux/swap.h index fec6647a289a..b61e2007d156 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,7 +463,7 @@ static inline unsigned long total_swapcache_pages(void) extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct page **, int); +extern void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 3a2c3f8cad2f..382581c4a9f6 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -48,7 +48,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -77,7 +77,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) { struct mmu_gather_batch *batch; @@ -92,13 +92,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. */ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); return false; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 40fe6f23e105..2927507b43d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -303,15 +303,12 @@ void free_page_and_swap_cache(struct page *page) * Passed an array of pages, drop them all from swapcache and then release * them. 
They are removed from the LRU and freed if this is their last use. */ -void free_pages_and_swap_cache(struct page **pages, int nr) +void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - struct page **pagep = pages; - int i; - lru_add_drain(); - for (i = 0; i < nr; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, nr); + for (int i = 0; i < nr; i++) + free_swap_cache(encoded_page_ptr(pages[i])); + release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) -- cgit v1.2.3 From 5df397dec7c4c08c23bd14f162f1228836faa4ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:51 -0800 Subject: mm: delay page_remove_rmap() until after the TLB has been flushed When we remove a page table entry, we are very careful to only free the page after we have flushed the TLB, because other CPUs could still be using the page through stale TLB entries until after the flush. However, we have removed the rmap entry for that page early, which means that functions like folio_mkclean() would end up not serializing with the page table lock because the page had already been made invisible to rmap. And that is a problem, because while the TLB entry exists, we could end up with the following situation: (a) one CPU could come in and clean it, never seeing our mapping of the page (b) another CPU could continue to use the stale and dirty TLB entry and continue to write to said page resulting in a page that has been dirtied, but then marked clean again, all while another CPU might have dirtied it some more. End result: possibly lost dirty data. This extends our current TLB gather infrastructure to optionally track a "should I do a delayed page_remove_rmap() for this page after flushing the TLB". It uses the newly introduced 'encoded page pointer' to do that without having to keep separate data around. Note, this is complicated by a couple of issues: - we want to delay the rmap removal, but not past the page table lock, because that simplifies the memcg accounting - only SMP configurations want to delay TLB flushing, since on UP there are obviously no remote TLBs to worry about, and the page table lock means there are no preemption issues either - s390 has its own mmu_gather model that doesn't delay TLB flushing, and as a result also does not want the delayed rmap. As such, we can treat S390 like the UP case and use a common fallback for the "no delays" case. - we can track an enormous number of pages in our mmu_gather structure, with MAX_GATHER_BATCH_COUNT batches of MAX_TABLE_BATCH pages each, all set up to be approximately 10k pending pages. We do not want to have a huge number of batched pages that we then need to check for delayed rmap handling inside the page table lock. Particularly that last point results in a noteworthy detail, where the normal page batch gathering is limited once we have delayed rmaps pending, in such a way that only the last batch (the so-called "active batch") in the mmu_gather structure can have any delayed entries. NOTE! While the "possibly lost dirty data" sounds catastrophic, for this all to happen you need to have a user thread doing either madvise() with MADV_DONTNEED or a full re-mmap() of the area concurrently with another thread continuing to use said mapping. So arguably this is about user space doing crazy things, but from a VM consistency standpoint it's better if we track the dirty bit properly even when user space goes off the rails. 
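Condensed from the mm/memory.c hunks below, the per-pte flow in zap_pte_range()
ends up roughly as follows (sketch only; the young/accessed handling and the
batch-full break are trimmed):

	delay_rmap = 0;
	if (!PageAnon(page) && pte_dirty(ptent)) {
		set_page_dirty(page);
		if (tlb_delay_rmap(tlb)) {
			delay_rmap = 1;
			force_flush = 1;	/* must flush before the pte lock is dropped */
		}
	}
	rss[mm_counter(page)]--;
	if (!delay_rmap)
		page_remove_rmap(page, vma, false);
	if (unlikely(__tlb_remove_page(tlb, page, delay_rmap)))
		force_flush = 1;

	/* later, still under the pte lock, before pte_unmap_unlock(): */
	if (force_flush) {
		tlb_flush_mmu_tlbonly(tlb);
		if (tlb->delayed_rmap)
			tlb_flush_rmaps(tlb, vma);
	}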
[akpm@linux-foundation.org: fix UP build, per Linus] Link: https://lore.kernel.org/all/B88D3073-440A-41C7-95F4-895D3F657EF2@gmail.com/ Link: https://lkml.kernel.org/r/20221109203051.1835763-4-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Reported-by: Nadav Amit Tested-by: Nadav Amit Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 3 +++ include/asm-generic/tlb.h | 31 +++++++++++++++++++++++++++++-- mm/memory.c | 23 +++++++++++++++++------ mm/mmu_gather.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 05142226d65d..b91f4a9b044c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -41,6 +41,9 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. + * + * s390 doesn't delay rmap removal, so there is nothing encoded in + * the page pointer. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 54d03d1e712e..b46617207c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -263,6 +263,28 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size); + +#ifdef CONFIG_SMP +/* + * This both sets 'delayed_rmap', and returns true. It would be an inline + * function, except we define it before the 'struct mmu_gather'. + */ +#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) +extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); +#endif + +#endif + +/* + * We have a no-op version of the rmap removal that doesn't + * delay anything. That is used on S390, which flushes remote + * TLBs synchronously, and on UP, which doesn't have any + * remote TLBs to flush and is not preemptible due to this + * all happening under the page table lock. + */ +#ifndef tlb_delay_rmap +#define tlb_delay_rmap(tlb) (false) +static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* @@ -295,6 +317,11 @@ struct mmu_gather { */ unsigned int freed_tables : 1; + /* + * Do we have pending delayed rmap removals? + */ + unsigned int delayed_rmap : 1; + /* * at which levels have we cleared entries? 
*/ @@ -440,9 +467,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags) { - return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE); + return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); } /* tlb_remove_page diff --git a/mm/memory.c b/mm/memory.c index 1749c638734f..6c85cba02113 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1374,6 +1374,8 @@ again: break; if (pte_present(ptent)) { + unsigned int delay_rmap; + page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; @@ -1385,20 +1387,26 @@ again: if (unlikely(!page)) continue; + delay_rmap = 0; if (!PageAnon(page)) { if (pte_dirty(ptent)) { - force_flush = 1; set_page_dirty(page); + if (tlb_delay_rmap(tlb)) { + delay_rmap = 1; + force_flush = 1; + } } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, vma, false); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (!delay_rmap) { + page_remove_rmap(page, vma, false); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { force_flush = 1; addr += PAGE_SIZE; break; @@ -1455,8 +1463,11 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) + if (force_flush) { tlb_flush_mmu_tlbonly(tlb); + if (tlb->delayed_rmap) + tlb_flush_rmaps(tlb, vma); + } pte_unmap_unlock(start_pte, ptl); /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 382581c4a9f6..1de1cf9ba581 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; + /* No more batching if we have delayed rmaps pending */ + if (tlb->delayed_rmap) + return false; + batch = tlb->active; if (batch->next) { tlb->active = batch->next; @@ -43,6 +48,33 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +#ifdef CONFIG_SMP +/** + * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB + * @tlb: the current mmu_gather + * + * Note that because of how tlb_next_batch() above works, we will + * never start new batches with pending delayed rmaps, so we only + * need to walk through the current active batch. 
+ */ +void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + for (int i = 0; i < batch->nr; i++) { + struct encoded_page *enc = batch->encoded_pages[i]; + + if (encoded_page_flags(enc)) { + struct page *page = encoded_page_ptr(enc); + page_remove_rmap(page, vma, false); + } + } + + tlb->delayed_rmap = 0; +} +#endif + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; @@ -284,6 +316,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->active = &tlb->local; tlb->batch_count = 0; #endif + tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE -- cgit v1.2.3 From 7c2af309abd24ff4e313246bf9b68f398d95c871 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 9 Nov 2022 20:50:42 +0900 Subject: zram: add size class equals check into recompression It makes no sense for us to recompress the object if it will be in the same size class. We anyway don't get any memory gain. But, at the same time, we get a CPU time overhead when inserting this object into zspage and decompressing it afterwards. [senozhatsky: rebased and fixed conflicts] Link: https://lkml.kernel.org/r/20221109115047.2921851-9-senozhatsky@chromium.org Signed-off-by: Alexey Romanov Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 11 ++++++++++- include/linux/zsmalloc.h | 2 ++ mm/zsmalloc.c | 21 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 473fc5eb71a1..66659f16f6c8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1672,6 +1672,8 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, unsigned long handle_new; unsigned int comp_len_old; unsigned int comp_len_new; + unsigned int class_index_old; + unsigned int class_index_new; void *src, *dst; int ret; @@ -1690,6 +1692,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (ret) return ret; + class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); /* * Iterate the secondary comp algorithms list (in order of priority) * and try to recompress the page. @@ -1715,9 +1718,13 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, return ret; } + class_index_new = zs_lookup_class_index(zram->mem_pool, + comp_len_new); + /* Continue until we make progress */ if (comp_len_new >= huge_class_size || comp_len_new >= comp_len_old || + class_index_new >= class_index_old || (threshold && comp_len_new >= threshold)) { zcomp_stream_put(zram->comps[prio]); continue; @@ -1740,7 +1747,9 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, * that would save memory, mark the object as incompressible so that * we will not try to compress it again. 
*/ - if (comp_len_new >= huge_class_size || comp_len_new >= comp_len_old) { + if (comp_len_new >= huge_class_size || + comp_len_new >= comp_len_old || + class_index_new >= class_index_old) { zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2a430e713ce5..a48cd0ffe57d 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -55,5 +55,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); + void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b52b7bb88b52..78feda34ad9a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1205,6 +1205,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +/** + * zs_lookup_class_index() - Returns index of the zsmalloc &size_class + * that hold objects of the provided size. + * @pool: zsmalloc pool to use + * @size: object size + * + * Context: Any context. + * + * Return: the index of the zsmalloc &size_class that hold objects of the + * provided size. + */ +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) +{ + struct size_class *class; + + class = pool->size_class[get_size_class_index(size)]; + + return class->index; +} +EXPORT_SYMBOL_GPL(zs_lookup_class_index); + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); -- cgit v1.2.3 From b7217a0bbe00a98a8f4b15ebc2a8355a31a59e1e Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Mon, 14 Nov 2022 23:59:49 +0000 Subject: mm: shrinkers: add missing includes for undeclared types The shrinker.h header depends on a user including other headers before it for types used by shrinker.h. Fix this by including the appropriate headers in shrinker.h. ./include/linux/shrinker.h:13:9: error: unknown type name `gfp_t' 13 | gfp_t gfp_mask; | ^~~~~ ./include/linux/shrinker.h:71:26: error: field `list' has incomplete type 71 | struct list_head list; | ^~~~ ./include/linux/shrinker.h:82:9: error: unknown type name `atomic_long_t' 82 | atomic_long_t *nr_deferred; | Link: https://lkml.kernel.org/r/20221114235949.201749-1-tjmercier@google.com Fixes: 83aeeada7c69 ("vmscan: use atomic-long for shrinker batching") Fixes: b0d40c92adaf ("superblock: introduce per-sb cache shrinker infrastructure") Signed-off-by: T.J. Mercier Cc: Al Viro Cc: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 08e6054e061f..71310efe2fab 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,9 @@ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H +#include +#include + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. 
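A quick self-containedness check (illustrative only, not part of the patch): a translation unit that includes shrinker.h before anything else now builds, whereas it previously failed with the errors quoted above because the user had to pull in the headers declaring gfp_t, struct list_head and atomic_long_t first.

    /* Illustrative check, not part of the patch: include shrinker.h first. */
    #include <linux/shrinker.h>

    /* With the added includes, the types used by struct shrinker are
     * already declared here; demo_shrinker is hypothetical and never
     * registered. */
    static struct shrinker demo_shrinker;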
-- cgit v1.2.3 From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), name for private anonymous memory, but not shared anonymous, can be set. However, naming shared anonymous memory just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows to give a name for anonymous shared memory, but not useful when parts of shared memory require to have distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private because VMM is sandboxed and drivers are running in their own processes). However, the VM tells back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segmenet */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc//maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 8 +++++--- fs/proc/task_mmu.c | 15 +++++++++++---- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 26 ++++++++++++-------------- mm/madvise.c | 7 ++----- mm/shmem.c | 29 +++++++++++++++++++++++++---- 6 files changed, 57 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 898c99eae8e4..b8f175ae4853 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,14 +426,16 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. 
If the mapping is not associated with a file: - ============= ==================================== + =================== =========================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - [anon:] an anonymous mapping that has been + [anon:] a private anonymous mapping that has been named by userspace - ============= ==================================== + [anon_shmem:] an anonymous shared memory mapping that has + been named by userspace + =================== =========================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a74cdcc9af0..89338950afd3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); diff --git a/include/linux/mm.h b/include/linux/mm.h index f873441303b7..686879dbb0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b0009e7d4ae..157c2e22cc7f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -549,21 +549,11 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * - * For private anonymous mappings, a pointer to a null terminated string - * containing the name given to the vma, or NULL if unnamed. */ - - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* - * Serialized by mmap_sem. Never use directly because it is - * valid only when vm_file is NULL. Use anon_vma_name instead. 
- */ - struct anon_vma_name *anon_name; - }; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -584,6 +574,14 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_sem. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif diff --git a/mm/madvise.c b/mm/madvise.c index b913ba6efc10..83b0c91a126b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -95,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - if (vma->vm_file) - return NULL; - return vma->anon_name; } @@ -183,7 +180,7 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; - if (!vma->vm_file) { + if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; @@ -1273,7 +1270,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int error; /* Only anonymous mappings can be named */ - if (vma->vm_file) + if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, diff --git a/mm/shmem.c b/mm/shmem.c index 7428ae3fa4b9..f418d21205be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -237,11 +237,17 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_anon_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + bool vma_is_shmem(struct vm_area_struct *vma) { - return vma->vm_ops == &shmem_vm_ops; + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); @@ -2263,7 +2269,8 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { - struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); int ret; ret = seal_check_future_write(info->seals, vma); @@ -2274,7 +2281,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + vma->vm_ops = &shmem_vm_ops; + else + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -3988,6 +3999,15 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; @@ -4163,6 +4183,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops 
generic_file_vm_ops +#define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 @@ -4268,7 +4289,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; return 0; } -- cgit v1.2.3 From 8d6a0ac09a16c026e1e2a03a61e12e95c48a25a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:47 +0100 Subject: mm: extend FAULT_FLAG_UNSHARE support to anything in a COW mapping Extend FAULT_FLAG_UNSHARE to break COW on anything mapped into a COW (i.e., private writable) mapping and adjust the documentation accordingly. FAULT_FLAG_UNSHARE will now also break COW when encountering the shared zeropage, a pagecache page, a PFNMAP, ... inside a COW mapping, by properly replacing the mapped page/pfn by a private copy (an exclusive anonymous page). Note that only do_wp_page() needs care: hugetlb_wp() already handles FAULT_FLAG_UNSHARE correctly. wp_huge_pmd()/wp_huge_pud() also handles it correctly, for example, splitting the huge zeropage on FAULT_FLAG_UNSHARE such that we can handle FAULT_FLAG_UNSHARE on the PTE level. This change is a requirement for reliable long-term R/O pinning in COW mappings. Link: https://lkml.kernel.org/r/20221116102659.70287-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++---- mm/memory.c | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 157c2e22cc7f..018b1c098173 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1039,9 +1039,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark - * exclusive) a possibly shared anonymous page that is - * mapped R/O. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a + * COW mapping, making sure that an exclusive anon page is + * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @@ -1066,7 +1066,7 @@ typedef struct { * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when - * no existing R/O-mapped anonymous page is encountered. + * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, diff --git a/mm/memory.c b/mm/memory.c index 6cec0adab37f..815d2ff05c62 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3431,10 +3431,6 @@ reuse: } wp_page_reuse(vmf); return VM_FAULT_WRITE; - } else if (unshare) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; } copy: /* -- cgit v1.2.3 From 84209e87c6963f928194a890399e24e8ad299db1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:48 +0100 Subject: mm/gup: reliable R/O long-term pinning in COW mappings We already support reliable R/O pinning of anonymous memory. 
However, assume we end up pinning (R/O long-term) a pagecache page or the shared zeropage inside a writable private ("COW") mapping. The next write access will trigger a write-fault and replace the pinned page by an exclusive anonymous page in the process page tables to break COW: the pinned page no longer corresponds to the page mapped into the process' page table. Now that FAULT_FLAG_UNSHARE can break COW on anything mapped into a COW mapping, let's properly break COW first before R/O long-term pinning something that's not an exclusive anon page inside a COW mapping. FAULT_FLAG_UNSHARE will break COW and map an exclusive anon page instead that can get pinned safely. With this change, we can stop using FOLL_FORCE|FOLL_WRITE for reliable R/O long-term pinning in COW mappings. With this change, the new R/O long-term pinning tests for non-anonymous memory succeed: # [RUN] R/O longterm GUP pin ... with shared zeropage ok 151 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd ok 152 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with tmpfile ok 153 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with huge zeropage ok 154 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (2048 kB) ok 155 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (1048576 kB) ok 156 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with shared zeropage ok 157 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd ok 158 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with tmpfile ok 159 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with huge zeropage ok 160 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (2048 kB) ok 161 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (1048576 kB) ok 162 Longterm R/O pin is reliable Note 1: We don't care about short-term R/O-pinning, because they have snapshot semantics: they are not supposed to observe modifications that happen after pinning. As one example, assume we start direct I/O to read from a page and store page content into a file: modifications to page content after starting direct I/O are not guaranteed to end up in the file. So even if we'd pin the shared zeropage, the end result would be as expected -- getting zeroes stored to the file. Note 2: For shared mappings we'll now always fallback to the slow path to lookup the VMA when R/O long-term pining. While that's the necessary price we have to pay right now, it's actually not that bad in practice: most FOLL_LONGTERM users already specify FOLL_WRITE, for example, along with FOLL_FORCE because they tried dealing with COW mappings correctly ... Note 3: For users that use FOLL_LONGTERM right now without FOLL_WRITE, such as VFIO, we'd now no longer pin the shared zeropage. Instead, we'd populate exclusive anon pages that we can pin. There was a concern that this could affect the memlock limit of existing setups. For example, a VM running with VFIO could run into the memlock limit and fail to run. However, we essentially had the same behavior already in commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") which got merged into some enterprise distros, and there were not any such complaints. So most probably, we're fine. 
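Condensed to its core, the new policy for non-anonymous pages looks roughly like the sketch below. It is a simplification of the gup_must_unshare() change in the diff that follows, not the verbatim kernel code; the anonymous-page handling is unchanged and omitted.

    /* Simplified sketch of the non-anonymous-page policy added to
     * gup_must_unshare(); anonymous pages are handled as before. */
    static bool must_unshare_non_anon(struct vm_area_struct *vma,
                                      unsigned int flags)
    {
            /* Short-term R/O pins keep snapshot semantics (Note 1). */
            if (!(flags & FOLL_LONGTERM))
                    return false;
            /* GUP-fast has no vma: force a slow-path lookup (Note 2). */
            if (!vma)
                    return true;
            /* Only private writable (COW) mappings need COW broken early. */
            return is_cow_mapping(vma->vm_flags);
    }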
Link: https://lkml.kernel.org/r/20221116102659.70287-10-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Daniel Vetter Reviewed-by: Vlastimil Babka Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++++++--- mm/gup.c | 10 +++++----- mm/huge_memory.c | 2 +- mm/hugetlb.c | 7 ++++--- 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 686879dbb0bd..d8363ac34a7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3149,8 +3149,12 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. */ -static inline bool gup_must_unshare(unsigned int flags, struct page *page) +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry @@ -3163,8 +3167,25 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) * Note: PageAnon(page) is stable until the page is actually getting * freed. */ - if (!PageAnon(page)) - return false; + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } /* Paired with a memory barrier in page_try_share_anon_rmap(). 
*/ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) diff --git a/mm/gup.c b/mm/gup.c index 2500d00db51b..39c84a200f06 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -603,7 +603,7 @@ retry: } } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } @@ -2380,7 +2380,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, goto pte_unmap; } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2566,7 +2566,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2632,7 +2632,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2672,7 +2672,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5eb702726a0e..86a30041a2e1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1480,7 +1480,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; - if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3d381b26d553..9d97c9a2a15d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6197,7 +6197,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, } } -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, + unsigned int flags, pte_t *pte, bool *unshare) { pte_t pteval = huge_ptep_get(pte); @@ -6209,7 +6210,7 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; if (flags & FOLL_WRITE) return true; - if (gup_must_unshare(flags, pte_page(pteval))) { + if (gup_must_unshare(vma, flags, pte_page(pteval))) { *unshare = true; return true; } @@ -6338,7 +6339,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. */ if (absent || - __follow_hugetlb_must_fault(flags, pte, &unshare)) { + __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; -- cgit v1.2.3 From 7438899b0b8df16a73d9a5d49b2a345d165adfe8 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:55 -0800 Subject: folio-compat: remove try_to_release_page() There are no more callers of try_to_release_page(), so remove it. This saves 85 bytes of kernel text. 
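Any remaining out-of-tree callers can switch to the folio interface directly; the removed wrapper (see the diff below) was a thin shim around it:

    /* Direct replacement for the removed wrapper, mirroring its body. */
    ret = filemap_release_folio(page_folio(page), gfp);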
Link: https://lkml.kernel.org/r/20221118073055.55694-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - mm/folio-compat.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b33ab86d5dca..2ec0ca1f3d38 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1105,7 +1105,6 @@ void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); -int try_to_release_page(struct page *page, gfp_t gfp); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index cbfe51091c39..86933fa8f3e1 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -118,12 +118,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -int try_to_release_page(struct page *page, gfp_t gfp) -{ - return filemap_release_folio(page_folio(page), gfp); -} -EXPORT_SYMBOL(try_to_release_page); - int isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) -- cgit v1.2.3 From 8e9d5ead865a1a7af74a444d2f00f1ef4539bfba Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:56 -0800 Subject: mm: add bdi_set_strict_limit() function Patch series "mm/block: add bdi sysfs knobs", v4. At meta network block devices (nbd) are used to implement remote block storage. In testing and during production it has been observed that these network block devices can consume a huge portion of the dirty writeback cache and writeback can take a considerable time. To be able to give stricter limits, I'm proposing the following changes: 1) introduce strictlimit knob Currently the max_ratio knob exists to limit the dirty_memory. However this knob only applies once (dirty_ratio + dirty_background_ratio) / 2 has been reached. With the BDI_CAP_STRICTLIMIT flag, the max_ratio can be applied without reaching that limit. This change exposes that knob. This knob can also be useful for NFS, fuse filesystems and USB devices. 2) Use part of 1000000 internal calculation The max_ratio is based on percentage. With the current machine sizes percentage values can be very high (1% of a 256GB main memory is already 2.5GB). This change uses part of 1000000 instead of percentages for the internal calculations. 3) Introduce two new sysfs knobs: min_bytes and max_bytes. Currently all calculations are based on ratio, but for a user it often more convenient to specify a limit in bytes. The new knobs will not store bytes values, instead they will translate the byte value to a corresponding ratio. As the internal values are now part of 1000, the ratio is closer to the specified value. However the value should be more seen as an approximation as it can fluctuate over time. 3) Introduce two new sysfs knobs: min_ratio_fine and max_ratio_fine. The granularity for the existing sysfs bdi knobs min_ratio and max_ratio is based on percentage values. The new sysfs bdi knobs min_ratio_fine and max_ratio_fine allow to specify the ratio as part of 1 million. 
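Taken together, the in-kernel side of these knobs could be driven roughly as in the hypothetical sketch below; the function names and signatures come from the diffs in this series, while the driver context and the chosen limit are purely illustrative assumptions.

    /* Hypothetical driver-side usage of the interfaces added by this
     * series (bdi_set_strict_limit(), bdi_set_max_bytes()); the 256 MiB
     * cap and the surrounding driver are illustrative only. */
    static int demo_limit_writeback(struct backing_dev_info *bdi)
    {
            int ret;

            /* Apply max_ratio/max_bytes before the global dirty limits kick in. */
            ret = bdi_set_strict_limit(bdi, 1);
            if (ret)
                    return ret;

            /* Cap this device's share of the dirty writeback cache. */
            return bdi_set_max_bytes(bdi, 256ULL << 20);
    }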
This patch (of 20): This adds the bdi_set_strict_limit function to be able to set/unset the BDI_CAP_STRICTLIMIT flag. Link: https://lkml.kernel.org/r/20221119005215.3052436-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20221119005215.3052436-2-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Jens Axboe Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab9..9c984ffc8a0a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,6 +104,7 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857ecc..3745b886722f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -698,6 +698,21 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { -- cgit v1.2.3 From ae82291e9ca47c3d6da6b77a00f427754aca413e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:59 -0800 Subject: mm: use part per 1000000 for bdi ratios To get finer granularity for ratio calculations use part per million instead of percentiles. This is especially important if we want to automatically convert byte values to ratios. Otherwise the values that are actually used can be quite different. This is also important for machines with more main memory (1% of 256GB is already 2.5GB). Link: https://lkml.kernel.org/r/20221119005215.3052436-5-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 3 +++ mm/backing-dev.c | 6 +++--- mm/page-writeback.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9c984ffc8a0a..1b50c028e5ad 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,6 +102,9 @@ static inline unsigned long wb_stat_error(void) #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. 
*/ +#define BDI_RATIO_SCALE 10000 + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a0899cce72ef..90fa517123dc 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -178,7 +178,7 @@ static ssize_t min_ratio_store(struct device *dev, return ret; } -BDI_SHOW(min_ratio, bdi->min_ratio) +BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -197,7 +197,7 @@ static ssize_t max_ratio_store(struct device *dev, return ret; } -BDI_SHOW(max_ratio, bdi->max_ratio) +BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, @@ -809,7 +809,7 @@ int bdi_init(struct backing_dev_info *bdi) kref_init(&bdi->refcnt); bdi->min_ratio = 0; - bdi->max_ratio = 100; + bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3745b886722f..dd98b2654302 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -197,7 +197,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, min *= this_bw; min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } @@ -655,6 +655,8 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) unsigned int delta; int ret = 0; + min_ratio *= BDI_RATIO_SCALE; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; @@ -665,7 +667,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; - if (bdi_min_ratio + delta < 100) { + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { @@ -684,6 +686,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) if (max_ratio > 100) return -EINVAL; + max_ratio *= BDI_RATIO_SCALE; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { @@ -775,15 +778,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) + wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; } -- cgit v1.2.3 From 00df7d51263b46ed93f7572e2d09579746f7b1eb Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:00 -0800 Subject: mm: add bdi_get_max_bytes() function This adds a function to return the specified value for max_bytes. It converts the stored max_ratio of the bdi to the corresponding bytes value. 
It introduces the bdi_get_bytes helper function to do the conversion. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The helper function will also be used by the min_bytes bdi knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-6-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1b50c028e5ad..473686c32775 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dd98b2654302..719404e0d03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,6 +650,18 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; @@ -701,6 +713,11 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) -- cgit v1.2.3 From 1bf27e98d26d1e62166a456ef17460be085cbe0b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:02 -0800 Subject: mm: add bdi_set_max_bytes() function This introduces the bdi_set_max_bytes() function. The max_bytes function does not store the max_bytes value. Instead it converts the max_bytes value into the corresponding ratio value. 
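As a worked example with illustrative numbers: with a global dirty threshold of 1 GiB, bdi_set_max_bytes(bdi, 64 MiB) stores a ratio of 62500 parts per 1000000, i.e. 6.25%. The conversion mirrors bdi_ratio_from_pages() in the diff below:

    /* Sketch of the bytes -> internal ratio conversion (see
     * bdi_ratio_from_pages() in the diff below); dirty_thresh comes from
     * global_dirty_limits() and can change over time. */
    pages = max_bytes >> PAGE_SHIFT;
    ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);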
Link: https://lkml.kernel.org/r/20221119005215.3052436-8-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 473686c32775..ea6c993433d5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,6 +108,7 @@ static inline unsigned long wb_stat_error(void) u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e74ef596dc27..20ae9adeb22f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -650,6 +651,28 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; @@ -722,6 +745,20 @@ u64 bdi_get_max_bytes(struct backing_dev_info *bdi) return bdi_get_bytes(bdi->max_ratio); } +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + unsigned long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_max_ratio(bdi, max_ratio); +} + int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) -- cgit v1.2.3 From 712c00d66a342a3ed375df41c3df7d3d2abad2c0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:05 -0800 Subject: mm: add bdi_get_min_bytes() function This adds a function to return the specified value for min_bytes. It converts the stored min_ratio of the bdi to the corresponding bytes value. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The returned value can be different than the value when the min_bytes value was set. Link: https://lkml.kernel.org/r/20221119005215.3052436-11-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ea6c993433d5..8e04567727e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. 
*/ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 20ae9adeb22f..c47824464f4c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -740,6 +740,11 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); -- cgit v1.2.3 From 803c98050569850be5fd51a2025c67622de887d9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:07 -0800 Subject: mm: add bdi_set_min_bytes() function This introduces the bdi_set_min_bytes() function. The min_bytes function does not store the min_bytes value. Instead it converts the min_bytes value into the corresponding ratio value. Link: https://lkml.kernel.org/r/20221119005215.3052436-13-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e04567727e6..572669758c7f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cefee7210d83..3d151e7a9b6c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -750,6 +750,20 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi) return bdi_get_bytes(bdi->min_ratio); } +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + unsigned long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_min_ratio(bdi, min_ratio); +} + u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); -- cgit v1.2.3 From 4e230b406eda9bdf7f8a71e2cc3df18a824abcb0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:10 -0800 Subject: mm: add bdi_set_max_ratio_no_scale() function This introduces bdi_set_max_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob max_ratio_fine. 
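For illustration, the _no_scale variant takes the ratio directly in the internal granularity of parts per 1000000 (100 * BDI_RATIO_SCALE), which is the granularity the max_ratio_fine knob exposes:

    /* Equivalent limits expressed through both entry points (illustrative). */
    bdi_set_max_ratio(bdi, 1);              /* 1%  == 10000 internal units */
    bdi_set_max_ratio_no_scale(bdi, 10000); /* same 1% limit, fine-grained entry */
    bdi_set_max_ratio_no_scale(bdi, 500);   /* 0.05%, not expressible via max_ratio */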
Link: https://lkml.kernel.org/r/20221119005215.3052436-16-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 572669758c7f..d9acbb22ff25 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3d151e7a9b6c..f44ade72966c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -719,6 +719,9 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra { int ret = 0; + if (max_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; @@ -731,6 +734,11 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra return ret; } +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); @@ -738,9 +746,6 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { - if (max_ratio > 100) - return -EINVAL; - return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); -- cgit v1.2.3 From 2c44af4f2aaa260199f218f11920c406e688693c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:13 -0800 Subject: mm: add bdi_set_min_ratio_no_scale() function This introduces bdi_set_min_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob min_ratio_fine. 
Link: https://lkml.kernel.org/r/20221119005215.3052436-19-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d9acbb22ff25..fbad4fcd408e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f44ade72966c..ad608ef2a243 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -690,6 +690,8 @@ static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ra unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; min_ratio *= BDI_RATIO_SCALE; spin_lock_bh(&bdi_lock); @@ -734,6 +736,11 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra return ret; } +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); -- cgit v1.2.3 From 373dfda2bac1173971099dc76254a016646f62ab Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Nov 2022 18:46:41 +0530 Subject: mm/thp: rename pmd_to_page() as pmd_pgtable_page() Current pmd_to_page(), which derives the page table page containing the pmd address has a very misleading name. The problem being, it sounds similar to pmd_page() which derives page embedded in a given pmd entry either for next level page or a mapped huge page. Rename it as pmd_pgtable_page() instead. Link: https://lkml.kernel.org/r/20221124131641.1523772-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Mike Kravetz Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d8363ac34a7c..2c73dc112ffc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_to_page(pmd_t *pmd) +static struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); @@ -2518,7 +2518,7 @@ static struct page *pmd_to_page(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_to_page(pmd)); + return ptlock_ptr(pmd_pgtable_page(pmd)); } static inline bool pmd_ptlock_init(struct page *page) @@ -2537,7 +2537,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) #else -- cgit v1.2.3 From 7e25de77bc5ea56cc3ff618fc8f4ea1896a4dbb3 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 25 Nov 2022 09:15:02 +0530 Subject: s390/mm: use pmd_pgtable_page() helper in __gmap_segment_gaddr() In __gmap_segment_gaddr() pmd level page table page is being extracted from the pmd pointer, similar to pmd_pgtable_page() implementation. This reduces some redundancy by directly using pmd_pgtable_page() instead, though first making it available. Link: https://lkml.kernel.org/r/20221125034502.1559986-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev Cc: Christian Borntraeger Cc: David Hildenbrand Cc: Heiko Carstens Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 5 ++--- include/linux/mm.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 02d15c8dc92e..8947451ae021 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -336,12 +336,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, static unsigned long __gmap_segment_gaddr(unsigned long *entry) { struct page *page; - unsigned long offset, mask; + unsigned long offset; offset = (unsigned long) entry / sizeof(unsigned long); offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); + page = pmd_pgtable_page((pmd_t *) entry); return page->index + offset; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c73dc112ffc..8df5cae69c80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_pgtable_page(pmd_t *pmd) +static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); -- cgit v1.2.3 From c31783eeae7b22dc3f6edde7339de6112959225d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:38 +0200 Subject: mm/pagewalk: don't trigger test_walk() in walk_page_vma() As Peter points out, the caller passes a single VMA and can just do that check itself. And in fact, no existing users rely on test_walk() getting called. 
So let's just remove it and make the implementation slightly more efficient. Link: https://lkml.kernel.org/r/20221021101141.84170-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 2 ++ mm/pagewalk.c | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f3fafb731ffd..37dc0208862d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -27,6 +27,8 @@ struct mm_walk; * "do page table walk over the current vma", returning * a negative value means "abort current page table walk * right now" and returning 1 means "skip the current vma" + * Note that this callback is not called when the caller + * passes in a single VMA as for walk_page_vma(). * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2ff3a5bebceb..0a5d71aaf9c7 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -526,18 +526,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, .vma = vma, .private = private, }; - int err; if (!walk.mm) return -EINVAL; mmap_assert_locked(walk.mm); - - err = walk_page_test(vma->vm_start, vma->vm_end, &walk); - if (err > 0) - return 0; - if (err < 0) - return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } -- cgit v1.2.3 From cb8d863313436339fb60f7dd5131af2e5854621e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:35 +0200 Subject: mm: remove VM_FAULT_WRITE All users -- GUP and KSM -- are gone, let's just remove it. Link: https://lkml.kernel.org/r/20221021101141.84170-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 --- mm/huge_memory.c | 2 +- mm/memory.c | 9 ++++----- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 018b1c098173..199f98be6f9c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -929,7 +929,6 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage - * @VM_FAULT_WRITE: Special case for get_user_pages * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. 
Index encoded * in upper bits @@ -950,7 +949,6 @@ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_WRITE = (__force vm_fault_t)0x000008, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, @@ -976,7 +974,6 @@ enum vm_fault_reason { { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8f10afba17a6..1d9ad909c87c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ reuse: if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; } unlock_fallback: diff --git a/mm/memory.c b/mm/memory.c index 815d2ff05c62..aad226daf41b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3213,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } delayacct_wpcopy_end(); - return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; + return 0; oom_free_new: put_page(new_page); oom: @@ -3277,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - vm_fault_t ret = VM_FAULT_WRITE; + vm_fault_t ret = 0; get_page(vmf->page); @@ -3430,7 +3430,7 @@ reuse: return 0; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } copy: /* @@ -3944,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; } rmap_flags |= RMAP_EXCLUSIVE; } -- cgit v1.2.3 From e07cda5f232fac4de0925d8a4c92e51e41fa2f6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:39 +0200 Subject: mm/pagewalk: add walk_page_range_vma() Let's add walk_page_range_vma(), which is similar to walk_page_vma(), however, is only interested in a subset of the VMA range. To be used in KSM code to stop using follow_page() next. 
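For illustration only, a minimal sketch of how a caller might use the new helper; the callback and range names here are placeholders rather than anything added by this series:

  static int demo_pte_entry(pte_t *pte, unsigned long addr,
                            unsigned long next, struct mm_walk *walk)
  {
          /* inspect one PTE inside [start, end) of the VMA */
          return 0;
  }

  static const struct mm_walk_ops demo_ops = {
          .pte_entry = demo_pte_entry,
  };

  /* mmap lock must be held; [start, end) must lie within the VMA */
  ret = walk_page_range_vma(vma, start, end, &demo_ops, NULL);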
Link: https://lkml.kernel.org/r/20221021101141.84170-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 3 +++ mm/pagewalk.c | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 37dc0208862d..959f52e5867d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -101,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0a5d71aaf9c7..7f1c9b274906 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -517,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + + if (start >= end || !walk.mm) + return -EINVAL; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + mmap_assert_locked(walk.mm); + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { -- cgit v1.2.3 From f7355e99d9f71fcde093193fd4b569a648ba5ce3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:41 +0200 Subject: mm/gup: remove FOLL_MIGRATION Fortunately, the last user (KSM) is gone, so let's just remove this rather special code from generic GUP handling -- especially because KSM never required the PMD handling as KSM only deals with individual base pages. 
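For context, the behaviour being dropped (a rough paraphrase of the code removed below, not new code): a caller passing FOLL_MIGRATION had follow_page_pte() wait for the migration to finish and retry instead of failing the lookup, roughly:

  if (!pte_present(pte) && (flags & FOLL_MIGRATION) &&
      is_migration_entry(pte_to_swp_entry(pte))) {
          pte_unmap_unlock(ptep, ptl);
          migration_entry_wait(mm, pmd, address);
          /* retry the page table lookup */
  }

With the last user gone, a non-present PTE now simply takes the no_page path.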
[akpm@linux-foundation.org: fix merge snafu]Link: https://lkml.kernel.org/r/20221021101141.84170-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/gup.c | 55 +++++------------------------------------------------- 2 files changed, 5 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8df5cae69c80..767c8c522e70 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,7 +3057,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_ANON 0x8000 /* don't do file mappings */ diff --git a/mm/gup.c b/mm/gup.c index 2860cf4a85e1..82b275bbaad5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -537,30 +537,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } + if (!pte_present(pte)) + goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; @@ -668,28 +651,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. 
- */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } + if (!pmd_present(pmdval)) + return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -703,18 +666,10 @@ retry: if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); -retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); -- cgit v1.2.3 From 4c9473e87e75a2a77ccd02e55c91ffe6a52b5df6 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Wed, 26 Oct 2022 10:52:18 +0530 Subject: mm/khugepaged: add tracepoint to collapse_file() "mm_khugepaged_collapse_file" for capturing is_shmem. Currently, is_shmem is not being captured. Capturing is_shmem is useful as it can indicate if tmpfs is being used as a backing store instead of persistent storage. Add the tracepoint in collapse_file() named "mm_khugepaged_collapse_file" for capturing is_shmem. [gautammenghani201@gmail.com: swap is_shmem and addr to save space, per Steven Rostedt] Link: https://lkml.kernel.org/r/20221202201807.182829-1-gautammenghani201@gmail.com Link: https://lkml.kernel.org/r/20221026052218.148234-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: Steven Rostedt (Google) [tracing] Cc: David Hildenbrand Cc: Masami Hiramatsu (Google) Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 38 ++++++++++++++++++++++++++++++++++++++ mm/khugepaged.c | 7 ++++--- 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 760455dfa860..3e6fb05852f9 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -203,5 +203,43 @@ TRACE_EVENT(mm_khugepaged_scan_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_collapse_file, + TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index, + bool is_shmem, unsigned long addr, struct file *file, + int nr, int result), + TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result), + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, hpfn) + __field(pgoff_t, index) + __field(unsigned long, addr) + __field(bool, is_shmem) + __string(filename, file->f_path.dentry->d_iname) + __field(int, nr) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->hpfn = hpage ? 
page_to_pfn(hpage) : -1; + __entry->index = index; + __entry->addr = addr; + __entry->is_shmem = is_shmem; + __assign_str(filename, file->f_path.dentry->d_iname); + __entry->nr = nr; + __entry->result = result; + ), + + TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s", + __entry->mm, + __entry->hpfn, + __entry->index, + __entry->addr, + __entry->is_shmem, + __get_str(filename), + __entry->nr, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 913b0f489352..78ec2771cc65 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1744,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; - pgoff_t index, end = start + HPAGE_PMD_NR; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr; + int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -2102,7 +2102,8 @@ out: mem_cgroup_uncharge(page_folio(hpage)); put_page(hpage); } - /* TODO: tracepoints */ + + trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } -- cgit v1.2.3 From d3a89233583bf8edab18ac09732759c71dbe0173 Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Mon, 28 Nov 2022 21:07:43 +0800 Subject: include/linux/pgtable.h: : remove redundant pte variable Return value from ptep_get_and_clear_full() directly instead of taking this in another redundant variable. Link: https://lkml.kernel.org/r/202211282107437343474@zte.com.cn Signed-off-by: zhang songyi Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c74cce67eec8..dfabd549d2e7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,9 +425,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { - pte_t pte; - pte = ptep_get_and_clear(mm, address, ptep); - return pte; + return ptep_get_and_clear(mm, address, ptep); } #endif -- cgit v1.2.3 From 7b09f5af01ede480cbe7abcb281cf17550a46ff5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:51 +0800 Subject: LoongArch: add sparse memory vmemmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sparse memory vmemmap support for LoongArch. SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. 
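As a sketch of why the virtually mapped memmap is cheap, the pfn/page conversions collapse to pointer arithmetic against the fixed vmemmap base; this is the generic CONFIG_SPARSEMEM_VMEMMAP definition, quoted here for context rather than added by this patch:

  /* asm-generic/memory_model.h, CONFIG_SPARSEMEM_VMEMMAP */
  #define __pfn_to_page(pfn)      (vmemmap + (pfn))
  #define __page_to_pfn(page)     (unsigned long)((page) - vmemmap)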
Link: https://lkml.kernel.org/r/20221027125253.3458989-3-chenhuacai@loongson.cn Signed-off-by: Min Zhou Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/pgtable.h | 7 +++- arch/loongarch/include/asm/sparsemem.h | 8 ++++ arch/loongarch/mm/init.c | 72 ++++++++++++++++++++++++++++++++-- include/linux/mm.h | 2 + mm/sparse-vmemmap.c | 10 +++++ 6 files changed, 96 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 903096bd87f8..6f7fa0c0ca08 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -487,6 +487,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y + select SPARSEMEM_VMEMMAP_ENABLE help Say Y to support efficient handling of sparse physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 9e6651846db9..022ec6be3602 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -11,6 +11,7 @@ #include #include +#include #include #if CONFIG_PGTABLE_LEVELS == 2 @@ -59,6 +60,7 @@ #include #include #include +#include struct mm_struct; struct vm_area_struct; @@ -86,7 +88,10 @@ extern unsigned long zero_page_mask; #define VMALLOC_START MODULES_END #define VMALLOC_END \ (vm_map_base + \ - min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE) + min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE) + +#define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK)) +#define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1) #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/loongarch/include/asm/sparsemem.h b/arch/loongarch/include/asm/sparsemem.h index 3d18cdf1b069..8d4af6aff8a8 100644 --- a/arch/loongarch/include/asm/sparsemem.h +++ b/arch/loongarch/include/asm/sparsemem.h @@ -11,8 +11,16 @@ #define SECTION_SIZE_BITS 29 /* 2^29 = Largest Huge Page Size */ #define MAX_PHYSMEM_BITS 48 +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_SIZE (sizeof(struct page) * (1UL << (cpu_pabits + 1 - PAGE_SHIFT))) +#endif + #endif /* CONFIG_SPARSEMEM */ +#ifndef VMEMMAP_SIZE +#define VMEMMAP_SIZE 0 /* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. 
*/ +#endif + #ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 addr); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 080061793c85..451d93667bcc 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -152,6 +152,72 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = NULL; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, NULL); + if (p) { + pmd_t entry; + + entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); + pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; + set_pmd_at(&init_mm, addr, pmd, entry); + + continue; + } + } else if (pmd_val(*pmd) & _PAGE_HUGE) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + if (vmemmap_populate_basepages(addr, next, node, NULL)) + return -ENOMEM; + } + + return 0; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ +#if CONFIG_PGTABLE_LEVELS == 2 + return vmemmap_populate_basepages(start, end, node, NULL); +#else + return vmemmap_populate_hugepages(start, end, node, NULL); +#endif +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) +{ +} +#endif +#endif + static pte_t *fixmap_pte(unsigned long addr) { pgd_t *pgd; @@ -168,7 +234,7 @@ static pte_t *fixmap_pte(unsigned long addr) new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); pgd_populate(&init_mm, pgd, new); #ifndef __PAGETABLE_PUD_FOLDED - pud_init((unsigned long)new, (unsigned long)invalid_pmd_table); + pud_init(new); #endif } @@ -179,7 +245,7 @@ static pte_t *fixmap_pte(unsigned long addr) new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, new); #ifndef __PAGETABLE_PMD_FOLDED - pmd_init((unsigned long)new, (unsigned long)invalid_pte_table); + pmd_init(new); #endif } diff --git a/include/linux/mm.h b/include/linux/mm.h index 767c8c522e70..7c31f898337c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3360,6 +3360,8 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +void pmd_init(void *addr); +void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 46ae542118c0..797b30e9050c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } +void __weak __meminit pmd_init(void *addr) +{ +} + pud_t * 
__meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); @@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pmd_init(p); pud_populate(&init_mm, pud, p); } return pud; } +void __weak __meminit pud_init(void *addr) +{ +} + p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pud_init(p); p4d_populate(&init_mm, p4d, p); } return p4d; -- cgit v1.2.3 From 2045a3b8911b6ee64dd9b522d61abc468ecdcdb5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:52 +0800 Subject: mm/sparse-vmemmap: generalise vmemmap_populate_hugepages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can share its implementation. Link: https://lkml.kernel.org/r/20221027125253.3458989-4-chenhuacai@loongson.cn Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Acked-by: Will Deacon Acked-by: Dave Hansen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Min Zhou Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 55 ++++++++--------------------- arch/loongarch/mm/init.c | 59 +++++++++---------------------- arch/x86/mm/init_64.c | 92 +++++++++++++++++------------------------------- include/linux/mm.h | 6 ++++ mm/sparse-vmemmap.c | 63 +++++++++++++++++++++++++++++++++ 5 files changed, 132 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 556154d821bf..27217ba12e57 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1137,53 +1137,28 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + vmemmap_verify((pte_t *)pmdp, node, addr, next); + return 1; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - unsigned long addr = start; - unsigned long next; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); if (!ARM64_KERNEL_USES_PMD_MAPS) return vmemmap_populate_basepages(start, end, node, altmap); - - do { - next = pmd_addr_end(addr, end); - - pgdp = vmemmap_pgd_populate(addr, node); - if (!pgdp) - return -ENOMEM; - - p4dp = vmemmap_p4d_populate(pgdp, addr, node); - if (!p4dp) - return -ENOMEM; - - pudp = vmemmap_pud_populate(p4dp, addr, node); - if (!pudp) - return -ENOMEM; - - pmdp = pmd_offset(pudp, addr); - if (pmd_none(READ_ONCE(*pmdp))) { - void *p = NULL; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (!p) { - if (vmemmap_populate_basepages(addr, next, node, altmap)) - return -ENOMEM; - continue; - } - - pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); - } else - 
vmemmap_verify((pte_t *)pmdp, node, addr, next); - } while (addr = next, addr != end); - - return 0; + else + return vmemmap_populate_hugepages(start, end, node, altmap); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 451d93667bcc..e018aed34586 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -153,52 +153,25 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, - int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { - unsigned long addr = start; - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_t entry; - for (addr = start; addr < end; addr = next) { - next = pmd_addr_end(addr, end); - - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - void *p = NULL; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, NULL); - if (p) { - pmd_t entry; - - entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); - pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; - set_pmd_at(&init_mm, addr, pmd, entry); - - continue; - } - } else if (pmd_val(*pmd) & _PAGE_HUGE) { - vmemmap_verify((pte_t *)pmd, node, addr, next); - continue; - } - if (vmemmap_populate_basepages(addr, next, node, NULL)) - return -ENOMEM; - } + entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); + pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; + set_pmd_at(&init_mm, addr, pmd, entry); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int huge = pmd_val(*pmd) & _PAGE_HUGE; + + if (huge) + vmemmap_verify((pte_t *)pmd, node, addr, next); - return 0; + return huge; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e8db4edd7cc9..a190aae8ceaf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1492,72 +1492,44 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { - unsigned long addr; - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - for (addr = start; addr < end; addr = next) { - next = pmd_addr_end(addr, end); - - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - void *p; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (p) { - pte_t entry; - - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); + pte_t entry; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + pr_debug(" 
[%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } - /* check to see if we have contiguous blocks */ - if (p_end != p || node_start != node) { - if (p_start) - pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", - addr_start, addr_end-1, p_start, p_end-1, node_start); - addr_start = addr; - node_start = node; - p_start = p; - } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) + vmemmap_use_new_sub_pmd(addr, next); +} - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) - vmemmap_use_new_sub_pmd(addr, next); +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmd); - continue; - } else if (altmap) - return -ENOMEM; /* no fallback */ - } else if (pmd_large(*pmd)) { - vmemmap_verify((pte_t *)pmd, node, addr, next); - vmemmap_use_sub_pmd(addr, next); - continue; - } - if (vmemmap_populate_basepages(addr, next, node, NULL)) - return -ENOMEM; + if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_use_sub_pmd(addr, next); } - return 0; + + return large; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c31f898337c..472cb60ace07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3373,8 +3373,14 @@ struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 797b30e9050c..c5398a5960d0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -295,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, NULL); } +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ +} + +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + return 0; +} + +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, 
node, addr, next); + continue; + } else if (altmap) { + /* + * No fallback: In any case we care about, the + * altmap should be reasonably sized and aligned + * such that vmemmap_alloc_block_buf() will always + * succeed. For consistency with the PTE case, + * return an error here as failure could indicate + * a configuration issue with the size of the altmap. + */ + return -ENOMEM; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) + continue; + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + } + return 0; +} + /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail -- cgit v1.2.3 From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: filemap: convert replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This is allows us to get rid of the wrapper The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 2 +- include/linux/pagemap.h | 2 +- mm/filemap.c | 52 ++++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..26817a2db463 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -837,7 +837,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); get_page(newpage); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2ec0ca1f3d38..29e1f9e76eb6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); -void replace_page_cache_page(struct page *old, struct page *new); +void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); diff --git a/mm/filemap.c b/mm/filemap.c index 242cd8bd8330..c4d4ace9cc70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) EXPORT_SYMBOL(file_write_and_wait_range); /** - * replace_page_cache_page - replace a pagecache page with a new one - * @old: page to be replaced - * @new: page to replace with - * - * This function replaces a page in the pagecache with a new one. On - * success it acquires the pagecache reference for the new page and - * drops it for the old page. Both the old and new pages must be - * locked. 
This function does not add the new page to the LRU, the + * replace_page_cache_folio - replace a pagecache folio with a new one + * @old: folio to be replaced + * @new: folio to replace with + * + * This function replaces a folio in the pagecache with a new one. On + * success it acquires the pagecache reference for the new folio and + * drops it for the old folio. Both the old and new folios must be + * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ -void replace_page_cache_page(struct page *old, struct page *new) +void replace_page_cache_folio(struct folio *old, struct folio *new) { - struct folio *fold = page_folio(old); - struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - VM_BUG_ON_PAGE(!PageLocked(old), old); - VM_BUG_ON_PAGE(!PageLocked(new), new); - VM_BUG_ON_PAGE(new->mapping, new); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(new->mapping, new); - get_page(new); + folio_get(new); new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(fold, fnew); + mem_cgroup_migrate(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(old)) - __dec_lruvec_page_state(old, NR_FILE_PAGES); - if (!PageHuge(new)) - __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) - __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) - __inc_lruvec_page_state(new, NR_SHMEM); + if (!folio_test_hugetlb(old)) + __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + if (!folio_test_hugetlb(new)) + __lruvec_stat_add_folio(new, NR_FILE_PAGES); + if (folio_test_swapbacked(old)) + __lruvec_stat_sub_folio(old, NR_SHMEM); + if (folio_test_swapbacked(new)) + __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) - free_folio(fold); - folio_put(fold); + free_folio(old); + folio_put(old); } -EXPORT_SYMBOL_GPL(replace_page_cache_page); +EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) -- cgit v1.2.3 From 6e1ca48d0669b0f5efcbaa051b23cd8e651a1614 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:26 -0700 Subject: folio-compat: remove lru_cache_add() There are no longer any callers of lru_cache_add(), so remove it. This saves 79 bytes of kernel text. Also cleanup some comments such that they reference the new folio_add_lru() instead. 
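The conversion is mechanical; as a sketch mirroring the wrapper removed below, any remaining caller becomes:

  /* before */
  lru_cache_add(page);

  /* after */
  folio_add_lru(page_folio(page));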
Link: https://lkml.kernel.org/r/20221101175326.13265-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/folio-compat.c | 6 ------ mm/truncate.c | 2 +- mm/workingset.c | 5 ++++- 4 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index b61e2007d156..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -381,7 +381,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file, void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); -void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 86933fa8f3e1..69ed25790c68 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -76,12 +76,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add(struct page *page) -{ - folio_add_lru(page_folio(page)); -} -EXPORT_SYMBOL(lru_cache_add); - void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { diff --git a/mm/truncate.c b/mm/truncate.c index c7bfd247a651..7b4ea4c4a46b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -565,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently - * sitting in the lru_cache_add() pagevecs. + * sitting in the folio_add_lru() pagevecs. */ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) diff --git a/mm/workingset.c b/mm/workingset.c index d2d02978588c..1a86645b7b3c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -492,7 +492,10 @@ void workingset_refault(struct folio *folio, void *shadow) /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); - /* XXX: Move to lru_cache_add() when it supports new vs putback */ + /* + * XXX: Move to folio_add_lru() when it supports new vs + * putback + */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } -- cgit v1.2.3 From 9fd330582b2fe43c49ebcd02b2480f051f85aad4 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:30 -0800 Subject: mm: add folio dtor and order setter functions Patch series "convert core hugetlb functions to folios", v5. ============== OVERVIEW =========================== Now that many hugetlb helper functions that deal with hugetlb specific flags[1] and hugetlb cgroups[2] are converted to folios, higher level allocation, prep, and freeing functions within hugetlb can also be converted to operate in folios. Patch 1 of this series implements the wrapper functions around setting the compound destructor and compound order for a folio. Besides the user added in patch 1, patch 2 and patch 9 also use these helper functions. Patches 2-10 convert the higher level hugetlb functions to folios. ============== TESTING =========================== LTP: Ran 10 back to back rounds of the LTP hugetlb test suite. 
Gigantic Huge Pages: Test allocation and freeing via hugeadm commands: hugeadm --pool-pages-min 1GB:10 hugeadm --pool-pages-min 1GB:0 Demote: Demote 1 1GB hugepages to 512 2MB hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # 512 cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages # 0 [1] https://lore.kernel.org/lkml/20220922154207.1575343-1-sidhartha.kumar@oracle.com/ [2] https://lore.kernel.org/linux-mm/20221101223059.460937-1-sidhartha.kumar@oracle.com/ This patch (of 10): Add folio equivalents for set_compound_order() and set_compound_page_dtor(). Also remove extra new-lines introduced by mm/hugetlb: convert move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios. [sidhartha.kumar@oracle.com: clarify folio_set_compound_order() zero support] Link: https://lkml.kernel.org/r/20221207223731.32784-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Tarun Sahu Cc: Rasmus Villemoes Cc: Wei Chen Signed-off-by: Andrew Morton --- include/linux/mm.h | 23 +++++++++++++++++++++++ mm/hugetlb.c | 4 +--- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 472cb60ace07..7dc376052d40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,6 +997,13 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } +static inline void folio_set_compound_dtor(struct folio *folio, + enum compound_dtor_id compound_dtor) +{ + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); + folio->_folio_dtor = compound_dtor; +} + void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) @@ -1012,6 +1019,22 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } +/* + * folio_set_compound_order is generally passed a non-zero order to + * initialize a large folio. However, hugetlb code abuses this by + * passing in zero when 'dissolving' a large folio. + */ +static inline void folio_set_compound_order(struct folio *folio, + unsigned int order) +{ + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9d97c9a2a15d..22512f7b0237 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1780,7 +1780,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio->_folio_dtor = HUGETLB_PAGE_DTOR; + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2938,7 +2938,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * a reservation exists for the allocation. 
*/ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); @@ -7343,7 +7342,6 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); - folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); -- cgit v1.2.3 From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 
0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 will randomly failed with the warning. The case shutdown the xfs when fsstress is running, and do it for many times. I think the reason is that dax pages in use are not able to be invalidated in time when fs is shutdown. The next time dax page to be associated, it still remains the mapping value set last time. I'll keep on solving it. The warning message in dax_writeback_one() can also be fixed because of the dax unshare. This patch (of 8): fsdax page is used not only when CoW, but also mapread. To make the it easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 38 ++++++++++++++++++++++---------------- include/linux/mm_types.h | 5 ++++- include/linux/page-flags.h | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 1c6867810cbd..84fadea08705 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. 
*/ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199f98be6f9c..3b8475007734 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,7 +104,10 @@ struct page { }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; - pgoff_t index; /* Our offset within mapping. */ + union { + pgoff_t index; /* Our offset within mapping. */ + unsigned long share; /* share count for fsdax */ + }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e42c55a7e012..9aec9fd8c50b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -638,7 +638,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ -#define PAGE_MAPPING_DAX_COW 0x1 +#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { -- cgit v1.2.3 From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 8 ++++++-- include/linux/dax.h | 2 ++ 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 8fb928cd9dce..c48a3a93ab29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1245,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 93bdd25680bc..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; diff --git a/include/linux/dax.h b/include/linux/dax.h index ba985333e26b..2b5ecb591059 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping, } #endif +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, -- cgit v1.2.3 From adb8213014b25c7f1d75d5b219becaadcd695efb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 2 Dec 2022 03:15:10 +0000 Subject: mm: memcg: fix stale protection of reclaim target memcg Patch series "mm: memcg: fix protection of reclaim target memcg", v3. 
This series fixes a bug in calculating the protection of the reclaim target memcg where we end up using stale effective protection values from the last reclaim operation, instead of completely ignoring the protection of the reclaim target as intended. More detailed explanation and examples in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest case that catches the bug. This patch (of 3): When we are doing memcg reclaim, the intended behavior is that we ignore any protection (memory.min, memory.low) of the target memcg (but not its children). Ever since the patch pointed to by the "Fixes" tag, we actually read a stale value for the target memcg protection when deciding whether to skip the memcg or not because it is protected. If the stale value happens to be high enough, we don't reclaim from the target memcg. Essentially, in some cases we may falsely skip reclaiming from the target memcg of reclaim because we read a stale protection value from last time we reclaimed from it. During reclaim, mem_cgroup_calculate_protection() is used to determine the effective protection (emin and elow) values of a memcg. The protection of the reclaim target is ignored, but we cannot set their effective protection to 0 due to a limitation of the current implementation (see comment in mem_cgroup_protection()). Instead, we leave their effective protection values unchaged, and later ignore it in mem_cgroup_protection(). However, mem_cgroup_protection() is called later in shrink_lruvec()->get_scan_count(), which is after the mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result, the stale effective protection values of the target memcg may lead us to skip reclaiming from the target memcg entirely, before calling shrink_lruvec(). This can be even worse with recursive protection, where the stale target memcg protection can be higher than its standalone protection. See two examples below (a similar version of example (a) is added to test_memcontrol in a later patch). (a) A simple example with proactive reclaim is as follows. Consider the following hierarchy: ROOT | A | B (memory.min = 10M) Consider the following scenario: - B has memory.current = 10M. - The system undergoes global reclaim (or memcg reclaim in A). - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() calculates the effective min (emin) of B as 10M. - mem_cgroup_below_min() returns true for B, we do not reclaim from B. - Now if we want to reclaim 5M from B using proactive reclaim (memory.reclaim), we should be able to, as the protection of the target memcg should be ignored. - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() immediately returns for B without doing anything, as B is the target memcg, relying on mem_cgroup_protection() to ignore B's stale effective min (still 10M). - mem_cgroup_below_min() reads the stale effective min for B and we skip it instead of ignoring its protection as intended, as we never reach mem_cgroup_protection(). (b) An more complex example with recursive protection is as follows. Consider the following hierarchy with memory_recursiveprot: ROOT | A (memory.min = 50M) | B (memory.min = 10M, memory.high = 40M) Consider the following scenario: - B has memory.current = 35M. - The system undergoes global reclaim (target memcg is NULL). - B will have an effective min of 50M (all of A's unclaimed protection). - B will not be reclaimed from. - Now allocate 10M more memory in B, pushing it above it's high limit. 
- The system undergoes memcg reclaim from B (target memcg is B). - Like example (a), we do nothing in mem_cgroup_calculate_protection(), then call mem_cgroup_below_min(), which will read the stale effective min for B (50M) and skip it. In this case, it's even worse because we are not just considering B's standalone protection (10M), but we are reading a much higher stale protection (50M) which will cause us to not reclaim from B at all. This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") which made mem_cgroup_calculate_protection() only change the state without returning any value. Before that commit, we used to return MEMCG_PROT_NONE for the target memcg, which would cause us to skip the mem_cgroup_below_{min/low}() checks. After that commit we do not return anything and we end up checking the min & low effective protections for the target memcg, which are stale. Update mem_cgroup_supports_protection() to also check if we are reclaiming from the target, and rename it to mem_cgroup_unprotected() (now returns true if we should not protect the memcg, much simpler logic). Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") Signed-off-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Chris Down Cc: David Rientjes Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 31 +++++++++++++++++++++---------- mm/vmscan.c | 11 ++++++----- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..d3c8203cab6c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -615,28 +615,32 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); -static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support - * protection. + * protection. 
The target memcg's protection is ignored, see + * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg); - + return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || + memcg == target; } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= @@ -1209,12 +1213,19 @@ static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, { } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) +{ + return true; +} +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9356a3ee639c..dcd476a66a59 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4513,7 +4513,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned mem_cgroup_calculate_protection(NULL, memcg); - if (mem_cgroup_below_min(memcg)) + if (mem_cgroup_below_min(NULL, memcg)) return false; need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); @@ -5100,8 +5100,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - if (mem_cgroup_below_min(memcg) || - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || + (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && + !sc->memcg_low_reclaim)) return 0; *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); @@ -6096,13 +6097,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) mem_cgroup_calculate_protection(target_memcg, memcg); - if (mem_cgroup_below_min(memcg)) { + if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; - } else if (mem_cgroup_below_low(memcg)) { + } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as -- cgit v1.2.3 From 12a5d3955227b0d7e04fb793ccceeb2a1dd275c5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 2 Dec 2022 14:35:31 -0800 Subject: mm: add nodes= arg to memory.reclaim The nodes= arg instructs the kernel to only scan the given nodes for proactive reclaim. For example use cases, consider a 2 tier memory system: nodes 0,1 -> top tier nodes 2,3 -> second tier $ echo "1m nodes=0" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory from node 0. Since node 0 is a top tier node, demotion will be attempted first. This is useful to direct proactive reclaim to specific nodes that are under pressure. 
$ echo "1m nodes=2,3" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory in the second tier, since this tier of memory has no demotion targets the memory will be reclaimed. $ echo "1m nodes=0,1" > memory.reclaim Instructs the kernel to reclaim memory from the top tier nodes, which can be desirable according to the userspace policy if there is pressure on the top tiers. Since these nodes have demotion targets, the kernel will attempt demotion first. Since commit 3f1509c57b1b ("Revert "mm/vmscan: never demote for memcg reclaim""), the proactive reclaim interface memory.reclaim does both reclaim and demotion. Reclaim and demotion incur different latency costs to the jobs in the cgroup. Demoted memory would still be addressable by the userspace at a higher latency, but reclaimed memory would need to incur a pagefault. The 'nodes' arg is useful to allow the userspace to control demotion and reclaim independently according to its policy: if the memory.reclaim is called on a node with demotion targets, it will attempt demotion first; if it is called on a node without demotion targets, it will only attempt reclaim. Link: https://lkml.kernel.org/r/20221202223533.1785418-1-almasrymina@google.com Signed-off-by: Mina Almasry Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Muchun Song Cc: Bagas Sanjaya Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Xu Cc: Yang Shi Cc: Yosry Ahmed Cc: zefan li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 15 +++++--- include/linux/swap.h | 3 +- mm/memcontrol.c | 67 ++++++++++++++++++++++++++------- mm/vmscan.c | 4 +- 4 files changed, 68 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cec76be9f2..c8ae7c897f14 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported. + This file accepts a string which contains the number of bytes to + reclaim. Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). - Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. + This file also allows the user to specify the nodes to reclaim from, + via the 'nodes=' key, for example:: + + echo "1G nodes=0,1" > memory.reclaim + + The above instructs the kernel to reclaim memory from nodes 0,1. + memory.peak A read-only single value file which exists on non-root cgroups. 
diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ceed49516ad..2787b84eaf12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + nodemask_t *nodemask); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c7a91689fef..ff65bc23be13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -2392,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2683,7 +2685,8 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, + NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3503,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, + NULL)) { ret = -EBUSY; break; } @@ -3614,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_retries--; } @@ -6418,7 +6423,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL); if (!reclaimed && !nr_retries--) break; @@ -6467,7 +6473,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_reclaims--; continue; } @@ -6590,21 +6597,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_NODES = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t if_tokens = { + { MEMORY_RECLAIM_NODES, "nodes=%s" }, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options; - int err; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + int token; + char value[256]; + nodemask_t nodemask = NODE_MASK_ALL; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = 
strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + token = match_token(start, if_tokens, args); + match_strlcpy(value, args, sizeof(value)); + switch (token) { + case MEMORY_RECLAIM_NODES: + if (nodelist_parse(value, nodemask) < 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6621,7 +6661,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + GFP_KERNEL, reclaim_options, + &nodemask); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a59171c6695..2b42ac9ad755 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6758,7 +6758,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + nodemask_t *nodemask) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6773,6 +6774,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put -- cgit v1.2.3 From c7cdf94e9cd7a03549e61b0f85949959191b8a10 Mon Sep 17 00:00:00 2001 From: Wang Yong Date: Wed, 7 Dec 2022 07:40:11 +0000 Subject: mm: fix typo in struct pglist_data code comment change "stat" to "start". Link: https://lkml.kernel.org/r/20221207074011.GA151242@cloud Fixes: c959924b0dc5 ("memory tiering: adjust hot threshold automatically") Signed-off-by: Wang Yong Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..128f3cde800c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,7 +1200,7 @@ typedef struct pglist_data { /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* - * number of promote candidate pages at stat time of current promote + * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; -- cgit v1.2.3
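
For reference, the memory_reclaim() changes a few commits up accept requests of the form "<size> [nodes=<nodelist>]". The standalone sketch below only illustrates that grammar in userspace C; it substitutes strtoull() and sscanf() for the kernel's memparse(), match_token() and nodelist_parse(), so treat it as an approximation of the accepted syntax rather than the patch's actual code.

#define _DEFAULT_SOURCE	/* for strsep() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "<size> [nodes=<list>]", e.g. "1G nodes=0,1". Returns 0 on success. */
static int parse_reclaim_request(char *buf, unsigned long long *bytes,
				 char nodelist[256])
{
	char *end, *tok;

	*bytes = strtoull(buf, &end, 0);
	if (end == buf)
		return -1;			/* the size is mandatory */

	/* Crude stand-in for memparse()'s k/m/g suffix handling. */
	switch (*end) {
	case 'k': case 'K': *bytes <<= 10; end++; break;
	case 'm': case 'M': *bytes <<= 20; end++; break;
	case 'g': case 'G': *bytes <<= 30; end++; break;
	}

	nodelist[0] = '\0';
	while ((tok = strsep(&end, " ")) != NULL) {
		if (!*tok)
			continue;		/* skip empty tokens, as the kernel loop does */
		if (sscanf(tok, "nodes=%255s", nodelist) != 1)
			return -1;		/* unknown key */
	}
	return 0;
}

int main(void)
{
	char req[] = "1G nodes=0,1";
	unsigned long long bytes;
	char nodes[256];

	if (parse_reclaim_request(req, &bytes, nodes))
		return 1;
	printf("reclaim %llu bytes from nodes '%s'\n", bytes,
	       nodes[0] ? nodes : "all");
	return 0;
}

Note that the kernel parser shown in the patch initializes its nodemask to NODE_MASK_ALL, so a request that omits the nodes= key considers every node and behaves like the pre-existing interface.
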