From 198b62f83eef1d605d70eca32759c92cdcc14175 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Thu, 15 Oct 2020 20:05:20 -0700
Subject: mm/filemap: fix storing to a THP shadow entry

When a THP is removed from the page cache by reclaim, we replace it
with a shadow entry that occupies all slots of the XArray previously
occupied by the THP.  If the user then accesses that page again, we
only allocate a single page, but storing it into the shadow entry
replaces all entries with that one page.  That leads to bugs like

page dumped because: VM_BUG_ON_PAGE(page_to_pgoff(page) != offset)
------------[ cut here ]------------
kernel BUG at mm/filemap.c:2529!

https://bugzilla.kernel.org/show_bug.cgi?id=206569

This is hard to reproduce with mainline, but happens regularly with
the THP patchset (as so many more THPs are created).  This solution is
taken from the THP patchset.  It splits the shadow entry into order-0
pieces at the time that we bring a new page into cache.

Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS")
Signed-off-by: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
Cc: Song Liu
Cc: "Kirill A . Shutemov"
Cc: Qian Cai
Link: https://lkml.kernel.org/r/20200903183029.14930-4-willy@infradead.org
Signed-off-by: Linus Torvalds
---
 mm/filemap.c | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 38546dca58fe..3b76d79fd0cf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
 static int __add_to_page_cache_locked(struct page *page,
 				      struct address_space *mapping,
-				      pgoff_t offset, gfp_t gfp_mask,
+				      pgoff_t offset, gfp_t gfp,
 				      void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, offset);
 	int huge = PageHuge(page);
 	int error;
-	void *old;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
 	page->index = offset;
 
 	if (!huge) {
-		error = mem_cgroup_charge(page, current->mm, gfp_mask);
+		error = mem_cgroup_charge(page, current->mm, gfp);
 		if (error)
 			goto error;
 	}
 
+	gfp &= GFP_RECLAIM_MASK;
+
 	do {
+		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+		void *entry, *old = NULL;
+
+		if (order > thp_order(page))
+			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+					order, gfp);
 		xas_lock_irq(&xas);
-		old = xas_load(&xas);
-		if (old && !xa_is_value(old))
-			xas_set_err(&xas, -EEXIST);
+		xas_for_each_conflict(&xas, entry) {
+			old = entry;
+			if (!xa_is_value(entry)) {
+				xas_set_err(&xas, -EEXIST);
+				goto unlock;
+			}
+		}
+
+		if (old) {
+			if (shadowp)
+				*shadowp = old;
+			/* entry may have been split before we acquired lock */
+			order = xa_get_order(xas.xa, xas.xa_index);
+			if (order > thp_order(page)) {
+				xas_split(&xas, old, order);
+				xas_reset(&xas);
+			}
+		}
+
 		xas_store(&xas, page);
 		if (xas_error(&xas))
 			goto unlock;
 
-		if (xa_is_value(old)) {
+		if (old)
 			mapping->nrexceptional--;
-			if (shadowp)
-				*shadowp = old;
-		}
 		mapping->nrpages++;
 
 		/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
 		__inc_lruvec_page_state(page, NR_FILE_PAGES);
 unlock:
 		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+	} while (xas_nomem(&xas, gfp));
 
 	if (xas_error(&xas)) {
 		error = xas_error(&xas);
--
cgit v1.2.3
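For readers unfamiliar with the multi-index XArray calls used above, here is
a minimal stand-alone sketch of the split-before-store pattern this patch
introduces.  It is not part of the patch: the function name
store_page_over_shadow is hypothetical, only the order-0 insertion case is
shown, and the -EEXIST conflict handling, nrexceptional/nrpages accounting
and xas_nomem() retry loop of the real __add_to_page_cache_locked() are
omitted.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>

/* Illustrative only: before storing a single (order-0) page at @index,
 * split any multi-order shadow entry covering that range so the store
 * overwrites one slot instead of the whole former THP range. */
static void store_page_over_shadow(struct xarray *xa, pgoff_t index,
				   struct page *page, gfp_t gfp)
{
	XA_STATE(xas, xa, index);
	unsigned int order = xa_get_order(xa, index);
	void *shadow = xa_load(xa, index);

	/* Pre-allocate the nodes needed for the split outside the lock. */
	if (order > 0 && xa_is_value(shadow))
		xas_split_alloc(&xas, shadow, order, gfp);

	xas_lock_irq(&xas);
	shadow = xas_load(&xas);
	if (xa_is_value(shadow)) {
		/* The entry may have been split before we took the lock. */
		order = xa_get_order(xa, index);
		if (order > 0) {
			xas_split(&xas, shadow, order);
			xas_reset(&xas);
		}
	}
	xas_store(&xas, page);	/* now replaces a single slot only */
	xas_unlock_irq(&xas);
}

Splitting the shadow entry down to order 0 first is what prevents the
xas_store() from clobbering every slot that the entry used to cover.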
Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:23 -0700 Subject: mm/filemap: fix page cache removal for arbitrary sized THPs Patch series "Remove assumptions of THP size". There are a number of places in the VM which assume that a THP is a PMD in size. That's true today, and remains true after this patch series, but this is a prerequisite for switching to arbitrary-sized THPs. thp_nr_pages() still returns either HPAGE_PMD_NR or 1, but will be changed later. This patch (of 11): page_cache_free_page() assumes THPs are PMD_SIZE; fix that assumption. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Acked-by: Kirill A. Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200908195539.25896-2-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 3b76d79fd0cf..2937b57329be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping, freepage(page); if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); + page_ref_sub(page, thp_nr_pages(page)); VM_BUG_ON_PAGE(page_count(page) <= 0, page); } else { put_page(page); -- cgit v1.2.3 From db660d462525c4a152b25e033c3dfa9c25d188e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Oct 2020 20:06:31 -0700 Subject: mm/filemap: fold ra_submit into do_sync_mmap_readahead Fold ra_submit() into its last remaining user and pass the readahead_control struct to both do_page_cache_ra() and page_cache_sync_ra(). Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-9-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +++++----- mm/internal.h | 10 ---------- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2937b57329be..8aad142ec4be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2588,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); struct file *fpin = NULL; - pgoff_t offset = vmf->pgoff; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ @@ -2600,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_readahead(mapping, ra, file, offset, - ra->ra_pages); + page_cache_sync_ra(&ractl, ra, ra->ra_pages); return fpin; } @@ -2621,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; - ra_submit(ra, mapping, file); + ractl._index = ra->start; + do_page_cache_ra(&ractl, ra->size, ra->async_size); return fpin; } diff --git a/mm/internal.h b/mm/internal.h index 059ad0115ec1..03654797a3a9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -59,16 +59,6 @@ static inline void force_page_cache_readahead(struct 
From 0e9aa6755757ea5e06e6130d5e50a93442456499 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Thu, 15 Oct 2020 20:09:58 -0700
Subject: mm: fix some broken comments

Fix some broken comments, including typos, grammatical errors and a
wrong function name.

Signed-off-by: Miaohe Lin
Signed-off-by: Andrew Morton
Link: https://lkml.kernel.org/r/20200913095456.54873-1-linmiaohe@huawei.com
Signed-off-by: Linus Torvalds
---
 mm/filemap.c    | 4 ++--
 mm/swap_state.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 8aad142ec4be..45e564a20f8e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1445,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
  * unlock_page - unlock a locked page
  * @page: the page
  *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
  * mechanism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -3004,7 +3004,7 @@ filler:
 		goto out;
 
 	/*
-	 * Page is not up to date and may be locked due one of the following
+	 * Page is not up to date and may be locked due to one of the following
 	 * case a: Page is being filled and the page lock is held
 	 * case b: Read/write error clearing the page uptodate status
 	 * case c: Truncation in progress (page locked)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index aa40e706604c..ee465827420e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -246,7 +246,7 @@ int add_to_swap(struct page *page)
 		goto fail;
 	/*
 	 * Normally the page will be dirtied in unmap because its pte should be
-	 * dirty. A special case is MADV_FREE page. The page'e pte could have
+	 * dirty. A special case is MADV_FREE page. The page's pte could have
 	 * dirty bit cleared but the page's SwapBacked bit is still set because
 	 * clearing the dirty bit and SwapBacked bit has no lock protected. For
 	 * such page, unmap will not set dirty bit for it, so page reclaim will
--
cgit v1.2.3