From c12cd77cb028255663810e6c4528f0325facff66 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Thu, 14 Apr 2022 19:14:01 -0700
Subject: mm/vmalloc: fix spinning drain_vmap_work after reading from
 /proc/vmcore

Commit 3ee48b6af49c ("mm, x86: Saving vmcore with non-lazy freeing of
vmas") introduced set_iounmap_nonlazy(), which sets vmap_lazy_nr to
lazy_max_pages() + 1, ensuring that any future vunmaps() immediately
purge the vmap areas instead of doing it lazily.

Commit 690467c81b1a ("mm/vmalloc: Move draining areas out of caller
context") moved the purging from the vunmap() caller to a worker thread.
Unfortunately, set_iounmap_nonlazy() can cause the worker thread to spin
(possibly forever).  For example, consider the following scenario:

 1. Thread reads from /proc/vmcore. This eventually calls
    __copy_oldmem_page() -> set_iounmap_nonlazy(), which sets
    vmap_lazy_nr to lazy_max_pages() + 1.

 2. Then it calls free_vmap_area_noflush() (via iounmap()), which adds 2
    pages (one page plus the guard page) to the purge list and
    vmap_lazy_nr. vmap_lazy_nr is now lazy_max_pages() + 3, so the
    drain_vmap_work is scheduled.

 3. Thread returns from the kernel and is scheduled out.

 4. Worker thread is scheduled in and calls drain_vmap_area_work(). It
    frees the 2 pages on the purge list. vmap_lazy_nr is now
    lazy_max_pages() + 1.

 5. This is still over the threshold, so it tries to purge areas again,
    but doesn't find anything.

 6. Repeat 5.

If the system is running with only one CPU (which is typicial for kdump)
and preemption is disabled, then this will never make forward progress:
there aren't any more pages to purge, so it hangs.  If there is more
than one CPU or preemption is enabled, then the worker thread will spin
forever in the background.  (Note that if there were already pages to be
purged at the time that set_iounmap_nonlazy() was called, this bug is
avoided.)

This can be reproduced with anything that reads from /proc/vmcore
multiple times.  E.g., vmcore-dmesg /proc/vmcore.

It turns out that improvements to vmap() over the years have obsoleted
the need for this "optimization".  I benchmarked `dd if=/proc/vmcore
of=/dev/null` with 4k and 1M read sizes on a system with a 32GB vmcore.
The test was run on 5.17, 5.18-rc1 with a fix that avoided the hang, and
5.18-rc1 with set_iounmap_nonlazy() removed entirely:

    |5.17  |5.18+fix|5.18+removal
  4k|40.86s|  40.09s|      26.73s
  1M|24.47s|  23.98s|      21.84s

The removal was the fastest (by a wide margin with 4k reads).  This
patch removes set_iounmap_nonlazy().

Link: https://lkml.kernel.org/r/52f819991051f9b865e9ce25605509bfdbacadcd.1649277321.git.osandov@fb.com
Fixes: 690467c81b1a  ("mm/vmalloc: Move draining areas out of caller context")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e163372d3967..0b17498a34f1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1671,17 +1671,6 @@ static DEFINE_MUTEX(vmap_purge_lock);
 /* for per-CPU blocks */
 static void purge_fragmented_blocks_allcpus(void);
 
-#ifdef CONFIG_X86_64
-/*
- * called before a call to iounmap() if the caller wants vm_area_struct's
- * immediately freed.
- */
-void set_iounmap_nonlazy(void)
-{
-	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
-}
-#endif /* CONFIG_X86_64 */
-
 /*
  * Purges all lazily-freed vmap areas.
  */
-- 
cgit v1.2.3


From 559089e0a93d44280ec3ab478830af319c56dbe3 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 15 Apr 2022 09:44:10 -0700
Subject: vmalloc: replace VM_NO_HUGE_VMAP with VM_ALLOW_HUGE_VMAP

Huge page backed vmalloc memory could benefit performance in many cases.
However, some users of vmalloc may not be ready to handle huge pages for
various reasons: hardware constraints, potential pages split, etc.
VM_NO_HUGE_VMAP was introduced to allow vmalloc users to opt-out huge
pages.  However, it is not easy to track down all the users that require
the opt-out, as the allocation are passed different stacks and may cause
issues in different layers.

To address this issue, replace VM_NO_HUGE_VMAP with an opt-in flag,
VM_ALLOW_HUGE_VMAP, so that users that benefit from huge pages could ask
specificially.

Also, remove vmalloc_no_huge() and add opt-in helper vmalloc_huge().

Fixes: fac54e2bfb5b ("x86/Kconfig: Select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP")
Link: https://lore.kernel.org/netdev/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg.de/"
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                 |  6 ++----
 arch/powerpc/kernel/module.c |  2 +-
 arch/s390/kvm/pv.c           |  7 +------
 include/linux/vmalloc.h      |  4 ++--
 mm/vmalloc.c                 | 17 ++++++++++-------
 5 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/arch/Kconfig b/arch/Kconfig
index 29b0167c088b..31c4fdc4a4ba 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -854,10 +854,8 @@ config HAVE_ARCH_HUGE_VMAP
 
 #
 #  Archs that select this would be capable of PMD-sized vmaps (i.e.,
-#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
-#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
-#  can be used to prohibit arch-specific allocations from using hugepages to
-#  help with this (e.g., modules may require it).
+#  arch_vmap_pmd_supported() returns true). The VM_ALLOW_HUGE_VMAP flag
+#  must be used to enable allocations to use hugepages.
 #
 config HAVE_ARCH_HUGE_VMALLOC
 	depends on HAVE_ARCH_HUGE_VMAP
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 40a583e9d3c7..97a76a8619fb 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -101,7 +101,7 @@ __module_alloc(unsigned long size, unsigned long start, unsigned long end, bool
 	 * too.
 	 */
 	return __vmalloc_node_range(size, 1, start, end, gfp, prot,
-				    VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP,
+				    VM_FLUSH_RESET_PERMS,
 				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 7f7c0d6af2ce..cc7c9599f43e 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -137,12 +137,7 @@ static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
 	/* Allocate variable storage */
 	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
 	vlen += uv_info.guest_virt_base_stor_len;
-	/*
-	 * The Create Secure Configuration Ultravisor Call does not support
-	 * using large pages for the virtual memory area.
-	 * This is a hardware limitation.
-	 */
-	kvm->arch.pv.stor_var = vmalloc_no_huge(vlen);
+	kvm->arch.pv.stor_var = vzalloc(vlen);
 	if (!kvm->arch.pv.stor_var)
 		goto out_err;
 	return 0;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3b1df7da402d..b159c2789961 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -26,7 +26,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
-#define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
+#define VM_ALLOW_HUGE_VMAP	0x00000400      /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
 	!defined(CONFIG_KASAN_VMALLOC)
@@ -153,7 +153,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
-void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0b17498a34f1..07da85ae825b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3095,7 +3095,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		return NULL;
 	}
 
-	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
+	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
 		unsigned long size_per_node;
 
 		/*
@@ -3262,21 +3262,24 @@ void *vmalloc(unsigned long size)
 EXPORT_SYMBOL(vmalloc);
 
 /**
- * vmalloc_no_huge - allocate virtually contiguous memory using small pages
- * @size:    allocation size
+ * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * @size:      allocation size
+ * @gfp_mask:  flags for the page level allocator
  *
- * Allocate enough non-huge pages to cover @size from the page level
+ * Allocate enough pages to cover @size from the page level
  * allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_no_huge(unsigned long size)
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
 {
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
 				    NUMA_NO_NODE, __builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc_no_huge);
+EXPORT_SYMBOL_GPL(vmalloc_huge);
 
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
-- 
cgit v1.2.3


From 3b8000ae185cb068adbda5f966a3835053c85fd4 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 22 Apr 2022 16:01:05 +1000
Subject: mm/vmalloc: huge vmalloc backing pages should be split rather than
 compound

Huge vmalloc higher-order backing pages were allocated with __GFP_COMP
in order to allow the sub-pages to be refcounted by callers such as
"remap_vmalloc_page [sic]" (remap_vmalloc_range).

However a similar problem exists for other struct page fields callers
use, for example fb_deferred_io_fault() takes a vmalloc'ed page and
not only refcounts it but uses ->lru, ->mapping, ->index.

This is not compatible with compound sub-pages, and can cause bad page
state issues like

  BUG: Bad page state in process swapper/0  pfn:00743
  page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x743
  flags: 0x7ffff000000000(node=0|zone=0|lastcpupid=0x7ffff)
  raw: 007ffff000000000 c00c00000001d0c8 c00c00000001d0c8 0000000000000000
  raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
  page dumped because: corrupted mapping in tail page
  Modules linked in:
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc3-00082-gfc6fff4a7ce1-dirty #2810
  Call Trace:
    dump_stack_lvl+0x74/0xa8 (unreliable)
    bad_page+0x12c/0x170
    free_tail_pages_check+0xe8/0x190
    free_pcp_prepare+0x31c/0x4e0
    free_unref_page+0x40/0x1b0
    __vunmap+0x1d8/0x420
    ...

The correct approach is to use split high-order pages for the huge
vmalloc backing. These allow callers to treat them in exactly the same
way as individually-allocated order-0 pages.

Link: https://lore.kernel.org/all/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg.de/
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
Cc: Song Liu <songliubraving@fb.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 07da85ae825b..cadfbb5155ea 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2653,15 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {
-		unsigned int page_order = vm_area_page_order(area);
-		int i, step = 1U << page_order;
+		int i;
 
-		for (i = 0; i < area->nr_pages; i += step) {
+		for (i = 0; i < area->nr_pages; i++) {
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
-			__free_pages(page, page_order);
+			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+			/*
+			 * High-order allocs for huge vmallocs are split, so
+			 * can be freed as an array of order-0 allocations
+			 */
+			__free_pages(page, 0);
 			cond_resched();
 		}
 		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2914,12 +2917,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			if (nr != nr_pages_request)
 				break;
 		}
-	} else
-		/*
-		 * Compound pages required for remap_vmalloc_page if
-		 * high-order pages.
-		 */
-		gfp |= __GFP_COMP;
+	}
 
 	/* High-order pages or fallback path if "bulk" fails. */
 
@@ -2933,6 +2931,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			page = alloc_pages_node(nid, gfp, order);
 		if (unlikely(!page))
 			break;
+		/*
+		 * Higher order allocations must be able to be treated as
+		 * indepdenent small pages by callers (as they can with
+		 * small-page vmallocs). Some drivers do their own refcounting
+		 * on vmalloc_to_page() pages, some use page->mapping,
+		 * page->lru, etc.
+		 */
+		if (order)
+			split_page(page, order);
 
 		/*
 		 * Careful, we allocate and map page-order pages, but
@@ -2992,11 +2999,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 	if (gfp_mask & __GFP_ACCOUNT) {
-		int i, step = 1U << page_order;
+		int i;
 
-		for (i = 0; i < area->nr_pages; i += step)
-			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
-					     step);
+		for (i = 0; i < area->nr_pages; i++)
+			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
 	}
 
 	/*
-- 
cgit v1.2.3