From 6cc22dc08a247b7b4a173e4561e39705a557d300 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 18 Jun 2018 14:15:30 -0700
Subject: revert "mm/memblock: add missing include <linux/bootmem.h>"

The patch fixed a W=1 warning but broke the ia64 build:

    CC      mm/memblock.o
  mm/memblock.c:1340: error: redefinition of `memblock_virt_alloc_try_nid_raw'
  ./include/linux/bootmem.h:335: error: previous definition of `memblock_virt_alloc_try_nid_raw' was here

Because inlcude/linux/bootmem.h says

	#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)

whereas mm/Makefile says

	obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o

So revert 26f09e9b3a06 ("mm/memblock: add missing include
<linux/bootmem.h>") while a full fix can be worked on.

Fixes: 26f09e9b3a06 ("mm/memblock: add missing include <linux/bootmem.h>")
Reported-by: Tony Luck <tony.luck@gmail.com>
Cc: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index cc16d70b8333..03d48d8835ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,7 +20,6 @@
 #include <linux/kmemleak.h>
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
-#include <linux/bootmem.h>
 
 #include <asm/sections.h>
 #include <linux/io.h>
-- 
cgit v1.2.3


From 3ee7e8697d5860b173132606d80a9cd35e7113ee Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 18 Jun 2018 15:46:58 +0200
Subject: bdi: Fix another oops in wb_workfn()

syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to
wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was
WB_shutting_down after wb->bdi->dev became NULL. This indicates that
unregister_bdi() failed to call wb_shutdown() on one of wb objects.

The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus
drops bdi's reference to wb structures before going through the list of
wbs again and calling wb_shutdown() on each of them. This way the loop
iterating through all wbs can easily miss a wb if that wb has already
passed through cgwb_remove_from_bdi_list() called from wb_shutdown()
from cgwb_release_workfn() and as a result fully shutdown bdi although
wb_workfn() for this wb structure is still running. In fact there are
also other ways cgwb_bdi_unregister() can race with
cgwb_release_workfn() leading e.g. to use-after-free issues:

CPU1                            CPU2
                                cgwb_bdi_unregister()
                                  cgwb_kill(*slot);

cgwb_release()
  queue_work(cgwb_release_wq, &wb->release_work);
cgwb_release_workfn()
                                  wb = list_first_entry(&bdi->wb_list, ...)
                                  spin_unlock_irq(&cgwb_lock);
  wb_shutdown(wb);
  ...
  kfree_rcu(wb, rcu);
                                  wb_shutdown(wb); -> oops use-after-free

We solve these issues by synchronizing writeback structure shutdown from
cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That
way we also no longer need synchronization using WB_shutting_down as the
mutex provides it for CONFIG_CGROUP_WRITEBACK case and without
CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from
bdi_unregister().

Reported-by: syzbot <syzbot+4a7438e774b21ddd8eca@syzkaller.appspotmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/backing-dev-defs.h |  2 +-
 mm/backing-dev.c                 | 20 +++++++-------------
 2 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 0bd432a4d7bd..24251762c20c 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -22,7 +22,6 @@ struct dentry;
  */
 enum wb_state {
 	WB_registered,		/* bdi_register() was done */
-	WB_shutting_down,	/* wb_shutdown() in progress */
 	WB_writeback_running,	/* Writeback is in progress */
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
@@ -189,6 +188,7 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
+	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 347cc834c04a..2e5d3df0853d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	spin_lock_bh(&wb->work_lock);
 	if (!test_and_clear_bit(WB_registered, &wb->state)) {
 		spin_unlock_bh(&wb->work_lock);
-		/*
-		 * Wait for wb shutdown to finish if someone else is just
-		 * running wb_shutdown(). Otherwise we could proceed to wb /
-		 * bdi destruction before wb_shutdown() is finished.
-		 */
-		wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
 		return;
 	}
-	set_bit(WB_shutting_down, &wb->state);
 	spin_unlock_bh(&wb->work_lock);
 
 	cgwb_remove_from_bdi_list(wb);
@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	flush_delayed_work(&wb->dwork);
 	WARN_ON(!list_empty(&wb->work_list));
-	/*
-	 * Make sure bit gets cleared after shutdown is finished. Matches with
-	 * the barrier provided by test_and_clear_bit() above.
-	 */
-	smp_wmb();
-	clear_and_wake_up_bit(WB_shutting_down, &wb->state);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work)
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 						release_work);
 
+	mutex_lock(&wb->bdi->cgwb_release_mutex);
 	wb_shutdown(wb);
 
 	css_put(wb->memcg_css);
 	css_put(wb->blkcg_css);
+	mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
+	mutex_init(&bdi->cgwb_release_mutex);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 	spin_lock_irq(&cgwb_lock);
 	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 		cgwb_kill(*slot);
+	spin_unlock_irq(&cgwb_lock);
 
+	mutex_lock(&bdi->cgwb_release_mutex);
+	spin_lock_irq(&cgwb_lock);
 	while (!list_empty(&bdi->wb_list)) {
 		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
 				      bdi_node);
@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 		spin_lock_irq(&cgwb_lock);
 	}
 	spin_unlock_irq(&cgwb_lock);
+	mutex_unlock(&bdi->cgwb_release_mutex);
 }
 
 /**
-- 
cgit v1.2.3


From 28557cc106e6d2aa8b8c5c7687ea9f8055ff3911 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 27 Jun 2018 23:26:05 -0700
Subject: Revert mm/vmstat.c: fix vmstat_update() preemption BUG

Revert commit c7f26ccfb2c3 ("mm/vmstat.c: fix vmstat_update() preemption
BUG").  Steven saw a "using smp_processor_id() in preemptible" message
and added a preempt_disable() section around it to keep it quiet.  This
is not the right thing to do it does not fix the real problem.

vmstat_update() is invoked by a kworker on a specific CPU.  This worker
it bound to this CPU.  The name of the worker was "kworker/1:1" so it
should have been a worker which was bound to CPU1.  A worker which can
run on any CPU would have a `u' before the first digit.

smp_processor_id() can be used in a preempt-enabled region as long as
the task is bound to a single CPU which is the case here.  If it could
run on an arbitrary CPU then this is the problem we have an should seek
to resolve.

Not only this smp_processor_id() must not be migrated to another CPU but
also refresh_cpu_vm_stats() which might access wrong per-CPU variables.
Not to mention that other code relies on the fact that such a worker
runs on one specific CPU only.

Therefore revert that commit and we should look instead what broke the
affinity mask of the kworker.

Link: http://lkml.kernel.org/r/20180504104451.20278-1-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Steven J. Hill <steven.hill@cavium.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 75eda9c2b260..8ba0870ecddd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1796,11 +1796,9 @@ static void vmstat_update(struct work_struct *w)
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
 		 */
-		preempt_disable();
 		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
 				this_cpu_ptr(&vmstat_work),
 				round_jiffies_relative(sysctl_stat_interval));
-		preempt_enable();
 	}
 }
 
-- 
cgit v1.2.3


From d50d82faa0c964e31f7a946ba8aba7c715ca7ab0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 27 Jun 2018 23:26:09 -0700
Subject: slub: fix failure when we delete and create a slab cache

In kernel 4.17 I removed some code from dm-bufio that did slab cache
merging (commit 21bb13276768: "dm bufio: remove code that merges slab
caches") - both slab and slub support merging caches with identical
attributes, so dm-bufio now just calls kmem_cache_create and relies on
implicit merging.

This uncovered a bug in the slub subsystem - if we delete a cache and
immediatelly create another cache with the same attributes, it fails
because of duplicate filename in /sys/kernel/slab/.  The slub subsystem
offloads freeing the cache to a workqueue - and if we create the new
cache before the workqueue runs, it complains because of duplicate
filename in sysfs.

This patch fixes the bug by moving the call of kobject_del from
sysfs_slab_remove_workfn to shutdown_cache.  kobject_del must be called
while we hold slab_mutex - so that the sysfs entry is deleted before a
cache with the same attributes could be created.

Running device-mapper-test-suite with:

  dmtest run --suite thin-provisioning -n /commit_failure_causes_fallback/

triggered:

  Buffer I/O error on dev dm-0, logical block 1572848, async page read
  device-mapper: thin: 253:1: metadata operation 'dm_pool_alloc_data_block' failed: error = -5
  device-mapper: thin: 253:1: aborting current metadata transaction
  sysfs: cannot create duplicate filename '/kernel/slab/:a-0000144'
  CPU: 2 PID: 1037 Comm: kworker/u48:1 Not tainted 4.17.0.snitm+ #25
  Hardware name: Supermicro SYS-1029P-WTR/X11DDW-L, BIOS 2.0a 12/06/2017
  Workqueue: dm-thin do_worker [dm_thin_pool]
  Call Trace:
   dump_stack+0x5a/0x73
   sysfs_warn_dup+0x58/0x70
   sysfs_create_dir_ns+0x77/0x80
   kobject_add_internal+0xba/0x2e0
   kobject_init_and_add+0x70/0xb0
   sysfs_slab_add+0xb1/0x250
   __kmem_cache_create+0x116/0x150
   create_cache+0xd9/0x1f0
   kmem_cache_create_usercopy+0x1c1/0x250
   kmem_cache_create+0x18/0x20
   dm_bufio_client_create+0x1ae/0x410 [dm_bufio]
   dm_block_manager_create+0x5e/0x90 [dm_persistent_data]
   __create_persistent_data_objects+0x38/0x940 [dm_thin_pool]
   dm_pool_abort_metadata+0x64/0x90 [dm_thin_pool]
   metadata_operation_failed+0x59/0x100 [dm_thin_pool]
   alloc_data_block.isra.53+0x86/0x180 [dm_thin_pool]
   process_cell+0x2a3/0x550 [dm_thin_pool]
   do_worker+0x28d/0x8f0 [dm_thin_pool]
   process_one_work+0x171/0x370
   worker_thread+0x49/0x3f0
   kthread+0xf8/0x130
   ret_from_fork+0x35/0x40
  kobject_add_internal failed for :a-0000144 with -EEXIST, don't try to register things with the same name in the same directory.
  kmem_cache_create(dm_bufio_buffer-16) failed with error -17

Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1806151817130.6333@file01.intranet.prod.int.rdu2.redhat.com
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reported-by: Mike Snitzer <snitzer@redhat.com>
Tested-by: Mike Snitzer <snitzer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slub_def.h | 4 ++++
 mm/slab_common.c         | 4 ++++
 mm/slub.c                | 7 ++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 09fa2c6f0e68..3a1a1dbc6f49 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -155,8 +155,12 @@ struct kmem_cache {
 
 #ifdef CONFIG_SYSFS
 #define SLAB_SUPPORTS_SYSFS
+void sysfs_slab_unlink(struct kmem_cache *);
 void sysfs_slab_release(struct kmem_cache *);
 #else
+static inline void sysfs_slab_unlink(struct kmem_cache *s)
+{
+}
 static inline void sysfs_slab_release(struct kmem_cache *s)
 {
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 890b1f04a03a..2296caf87bfb 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_cache *s)
 	list_del(&s->list);
 
 	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
+#ifdef SLAB_SUPPORTS_SYSFS
+		sysfs_slab_unlink(s);
+#endif
 		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
 		schedule_work(&slab_caches_to_rcu_destroy_work);
 	} else {
 #ifdef SLAB_SUPPORTS_SYSFS
+		sysfs_slab_unlink(s);
 		sysfs_slab_release(s);
 #else
 		slab_kmem_cache_release(s);
diff --git a/mm/slub.c b/mm/slub.c
index a3b8467c14af..51258eff4178 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5667,7 +5667,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
 	kset_unregister(s->memcg_kset);
 #endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
-	kobject_del(&s->kobj);
 out:
 	kobject_put(&s->kobj);
 }
@@ -5752,6 +5751,12 @@ static void sysfs_slab_remove(struct kmem_cache *s)
 	schedule_work(&s->kobj_remove_work);
 }
 
+void sysfs_slab_unlink(struct kmem_cache *s)
+{
+	if (slab_state >= FULL)
+		kobject_del(&s->kobj);
+}
+
 void sysfs_slab_release(struct kmem_cache *s)
 {
 	if (slab_state >= FULL)
-- 
cgit v1.2.3


From 520495fe96d74e05db585fc748351e0504d8f40d Mon Sep 17 00:00:00 2001
From: Cannon Matthews <cannonmatthews@google.com>
Date: Tue, 3 Jul 2018 17:02:43 -0700
Subject: mm: hugetlb: yield when prepping struct pages

When booting with very large numbers of gigantic (i.e.  1G) pages, the
operations in the loop of gather_bootmem_prealloc, and specifically
prep_compound_gigantic_page, takes a very long time, and can cause a
softlockup if enough pages are requested at boot.

For example booting with 3844 1G pages requires prepping
(set_compound_head, init the count) over 1 billion 4K tail pages, which
takes considerable time.

Add a cond_resched() to the outer loop in gather_bootmem_prealloc() to
prevent this lockup.

Tested: Booted with softlockup_panic=1 hugepagesz=1G hugepages=3844 and
no softlockup is reported, and the hugepages are reported as
successfully setup.

Link: http://lkml.kernel.org/r/20180627214447.260804-1-cannonmatthews@google.com
Signed-off-by: Cannon Matthews <cannonmatthews@google.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3612fbb32e9d..039ddbc574e9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2163,6 +2163,7 @@ static void __init gather_bootmem_prealloc(void)
 		 */
 		if (hstate_is_gigantic(h))
 			adjust_managed_page_count(page, 1 << h->order);
+		cond_resched();
 	}
 }
 
-- 
cgit v1.2.3


From 1e8e18f694a52d703665012ca486826f64bac29d Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Tue, 3 Jul 2018 17:02:46 -0700
Subject: kasan: fix shadow_size calculation error in kasan_module_alloc

There is a special case that the size is "(N << KASAN_SHADOW_SCALE_SHIFT)
Pages plus X", the value of X is [1, KASAN_SHADOW_SCALE_SIZE-1].  The
operation "size >> KASAN_SHADOW_SCALE_SHIFT" will drop X, and the
roundup operation can not retrieve the missed one page.  For example:
size=0x28006, PAGE_SIZE=0x1000, KASAN_SHADOW_SCALE_SHIFT=3, we will get
shadow_size=0x5000, but actually we need 6 pages.

  shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, PAGE_SIZE);

This can lead to a kernel crash when kasan is enabled and the value of
mod->core_layout.size or mod->init_layout.size is like above.  Because
the shadow memory of X has not been allocated and mapped.

move_module:
  ptr = module_alloc(mod->core_layout.size);
  ...
  memset(ptr, 0, mod->core_layout.size);		//crashed

  Unable to handle kernel paging request at virtual address ffff0fffff97b000
  ......
  Call trace:
    __asan_storeN+0x174/0x1a8
    memset+0x24/0x48
    layout_and_allocate+0xcd8/0x1800
    load_module+0x190/0x23e8
    SyS_finit_module+0x148/0x180

Link: http://lkml.kernel.org/r/1529659626-12660-1-git-send-email-thunder.leizhen@huawei.com
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Dmitriy Vyukov <dvyukov@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Libin <huawei.libin@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/kasan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index f185455b3406..c3bd5209da38 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -619,12 +619,13 @@ void kasan_kfree_large(void *ptr, unsigned long ip)
 int kasan_module_alloc(void *addr, size_t size)
 {
 	void *ret;
+	size_t scaled_size;
 	size_t shadow_size;
 	unsigned long shadow_start;
 
 	shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
-	shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
-			PAGE_SIZE);
+	scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
+	shadow_size = round_up(scaled_size, PAGE_SIZE);
 
 	if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
 		return -EINVAL;
-- 
cgit v1.2.3


From fc36def997cfd6cbff3eda4f82853a5c311c5466 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Tue, 3 Jul 2018 17:02:53 -0700
Subject: mm: teach dump_page() to correctly output poisoned struct pages

If struct page is poisoned, and uninitialized access is detected via
PF_POISONED_CHECK(page) dump_page() is called to output the page.  But,
the dump_page() itself accesses struct page to determine how to print
it, and therefore gets into a recursive loop.

For example:

  dump_page()
   __dump_page()
    PageSlab(page)
     PF_POISONED_CHECK(page)
      VM_BUG_ON_PGFLAGS(PagePoisoned(page), page)
       dump_page() recursion loop.

Link: http://lkml.kernel.org/r/20180702180536.2552-1-pasha.tatashin@oracle.com
Fixes: f165b378bbdf ("mm: uninitialized struct page poisoning sanity checking")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/debug.c b/mm/debug.c
index 56e2d9125ea5..38c926520c97 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -43,12 +43,25 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+	bool page_poisoned = PagePoisoned(page);
+	int mapcount;
+
+	/*
+	 * If struct page is poisoned don't access Page*() functions as that
+	 * leads to recursive loop. Page*() check for poisoned pages, and calls
+	 * dump_page() when detected.
+	 */
+	if (page_poisoned) {
+		pr_emerg("page:%px is uninitialized and poisoned", page);
+		goto hex_only;
+	}
+
 	/*
 	 * Avoid VM_BUG_ON() in page_mapcount().
 	 * page->_mapcount space in struct page is used by sl[aou]b pages to
 	 * encode own info.
 	 */
-	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+	mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
 	pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
 		  page, page_ref_count(page), mapcount,
@@ -60,6 +73,7 @@ void __dump_page(struct page *page, const char *reason)
 
 	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
 
+hex_only:
 	print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
 			sizeof(unsigned long), page,
 			sizeof(struct page), false);
@@ -68,7 +82,7 @@ void __dump_page(struct page *page, const char *reason)
 		pr_alert("page dumped because: %s\n", reason);
 
 #ifdef CONFIG_MEMCG
-	if (page->mem_cgroup)
+	if (!page_poisoned && page->mem_cgroup)
 		pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
 #endif
 }
-- 
cgit v1.2.3


From e181ae0c5db9544de9c53239eb22bc012ce75033 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Sat, 14 Jul 2018 09:15:07 -0400
Subject: mm: zero unavailable pages before memmap init

We must zero struct pages for memory that is not backed by physical
memory, or kernel does not have access to.

Recently, there was a change which zeroed all memmap for all holes in
e820.  Unfortunately, it introduced a bug that is discussed here:

  https://www.spinics.net/lists/linux-mm/msg156764.html

Linus, also saw this bug on his machine, and confirmed that reverting
commit 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into
memblock.reserved") fixes the issue.

The problem is that we incorrectly zero some struct pages after they
were setup.

The fix is to zero unavailable struct pages prior to initializing of
struct pages.

A more detailed fix should come later that would avoid double zeroing
cases: one in __init_single_page(), the other one in
zero_resv_unavail().

Fixes: 124049decbb1 ("x86/e820: put !E820_TYPE_RAM regions into memblock.reserved")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..5d800d61ddb7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6847,6 +6847,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
 	setup_nr_node_ids();
+	zero_resv_unavail();
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 		free_area_init_node(nid, NULL,
@@ -6857,7 +6858,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 			node_set_state(nid, N_MEMORY);
 		check_for_memory(pgdat, nid);
 	}
-	zero_resv_unavail();
 }
 
 static int __init cmdline_parse_core(char *p, unsigned long *core,
@@ -7033,9 +7033,9 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 
 void __init free_area_init(unsigned long *zones_size)
 {
+	zero_resv_unavail();
 	free_area_init_node(0, zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-	zero_resv_unavail();
 }
 
 static int page_alloc_cpu_dead(unsigned int cpu)
-- 
cgit v1.2.3


From bce73e4842390f7b7309c8e253e139db71288ac3 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 13 Jul 2018 16:58:52 -0700
Subject: mm: do not drop unused pages when userfaultd is running

KVM guests on s390 can notify the host of unused pages.  This can result
in pte_unused callbacks to be true for KVM guest memory.

If a page is unused (checked with pte_unused) we might drop this page
instead of paging it.  This can have side-effects on userfaultd, when
the page in question was already migrated:

The next access of that page will trigger a fault and a user fault
instead of faulting in a new and empty zero page.  As QEMU does not
expect a userfault on an already migrated page this migration will fail.

The most straightforward solution is to ignore the pte_unused hint if a
userfault context is active for this VMA.

Link: http://lkml.kernel.org/r/20180703171854.63981-1-borntraeger@de.ibm.com
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 6db729dc4c50..eb477809a5c0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -64,6 +64,7 @@
 #include <linux/backing-dev.h>
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/tlbflush.h>
 
@@ -1481,11 +1482,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
-		} else if (pte_unused(pteval)) {
+		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
 			/*
 			 * The guest indicated that the page content is of no
 			 * interest anymore. Simply discard the pte, vmscan
 			 * will take care of the rest.
+			 * A future reference will then fault in a new zero
+			 * page. When userfaultfd is active, we must not drop
+			 * this page though, as its main user (postcopy
+			 * migration) will not expect userfaults on already
+			 * copied pages.
 			 */
 			dec_mm_counter(mm, mm_counter(page));
 			/* We have to invalidate as we cleared the pte */
-- 
cgit v1.2.3


From e3d301cae0092062cbcd6b4e7ceebbab9d87e263 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 13 Jul 2018 16:59:16 -0700
Subject: mm/memblock.c: do not complain about top-down allocations for
 !MEMORY_HOTREMOVE

Mike Rapoport is converting architectures from bootmem to nobootmem
allocator.  While doing so for m68k Geert has noticed that he gets a
scary looking warning:

  WARNING: CPU: 0 PID: 0 at mm/memblock.c:230
  memblock_find_in_range_node+0x11c/0x1be
  memblock: bottom-up allocation failed, memory hotunplug may be affected
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted
  4.18.0-rc3-atari-01343-gf2fb5f2e09a97a3c-dirty #7
  Call Trace: __warn+0xa8/0xc2
    kernel_pg_dir+0x0/0x1000
    netdev_lower_get_next+0x2/0x22
    warn_slowpath_fmt+0x2e/0x36
    memblock_find_in_range_node+0x11c/0x1be
    memblock_find_in_range_node+0x11c/0x1be
    memblock_find_in_range_node+0x0/0x1be
    vprintk_func+0x66/0x6e
    memblock_virt_alloc_internal+0xd0/0x156
    netdev_lower_get_next+0x2/0x22
    netdev_lower_get_next+0x2/0x22
    kernel_pg_dir+0x0/0x1000
    memblock_virt_alloc_try_nid_nopanic+0x58/0x7a
    netdev_lower_get_next+0x2/0x22
    kernel_pg_dir+0x0/0x1000
    kernel_pg_dir+0x0/0x1000
    EXPTBL+0x234/0x400
    EXPTBL+0x234/0x400
    alloc_node_mem_map+0x4a/0x66
    netdev_lower_get_next+0x2/0x22
    free_area_init_node+0xe2/0x29e
    EXPTBL+0x234/0x400
    paging_init+0x430/0x462
    kernel_pg_dir+0x0/0x1000
    printk+0x0/0x1a
    EXPTBL+0x234/0x400
    setup_arch+0x1b8/0x22c
    start_kernel+0x4a/0x40a
    _sinittext+0x344/0x9e8

The warning is basically saying that a top-down allocation can break
memory hotremove because memblock allocation is not movable.  But m68k
doesn't even support MEMORY_HOTREMOVE so there is no point to warn about
it.

Make the warning conditional only to configurations that care.

Link: http://lkml.kernel.org/r/20180706061750.GH32658@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Greg Ungerer <gerg@linux-m68k.org>
Cc: Sam Creasey <sammy@sammy.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 03d48d8835ba..11e46f83e1ad 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -227,7 +227,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 		 * so we use WARN_ONCE() here to see the stack trace if
 		 * fail happens.
 		 */
-		WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
+		WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
+			  "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
 	}
 
 	return __memblock_find_range_top_down(start, end, size, align, nid,
-- 
cgit v1.2.3


From bb177a732c4369bb58a1fe1df8f552b6f0f7db5f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 13 Jul 2018 16:59:20 -0700
Subject: mm: do not bug_on on incorrect length in __mm_populate()

syzbot has noticed that a specially crafted library can easily hit
VM_BUG_ON in __mm_populate

  kernel BUG at mm/gup.c:1242!
  invalid opcode: 0000 [#1] SMP
  CPU: 2 PID: 9667 Comm: a.out Not tainted 4.18.0-rc3 #644
  Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
  RIP: 0010:__mm_populate+0x1e2/0x1f0
  Code: 55 d0 65 48 33 14 25 28 00 00 00 89 d8 75 21 48 83 c4 20 5b 41 5c 41 5d 41 5e 41 5f 5d c3 e8 75 18 f1 ff 0f 0b e8 6e 18 f1 ff <0f> 0b 31 db eb c9 e8 93 06 e0 ff 0f 1f 00 55 48 89 e5 53 48 89 fb
  Call Trace:
     vm_brk_flags+0xc3/0x100
     vm_brk+0x1f/0x30
     load_elf_library+0x281/0x2e0
     __ia32_sys_uselib+0x170/0x1e0
     do_fast_syscall_32+0xca/0x420
     entry_SYSENTER_compat+0x70/0x7f

The reason is that the length of the new brk is not page aligned when we
try to populate the it.  There is no reason to bug on that though.
do_brk_flags already aligns the length properly so the mapping is
expanded as it should.  All we need is to tell mm_populate about it.
Besides that there is absolutely no reason to to bug_on in the first
place.  The worst thing that could happen is that the last page wouldn't
get populated and that is far from putting system into an inconsistent
state.

Fix the issue by moving the length sanitization code from do_brk_flags
up to vm_brk_flags.  The only other caller of do_brk_flags is brk
syscall entry and it makes sure to provide the proper length so t here
is no need for sanitation and so we can use do_brk_flags without it.

Also remove the bogus BUG_ONs.

[osalvador@techadventures.net: fix up vm_brk_flags s@request@len@]
Link: http://lkml.kernel.org/r/20180706090217.GI32658@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: syzbot <syzbot+5dcb560fe12aa5091c06@syzkaller.appspotmail.com>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/gup.c  |  2 --
 mm/mmap.c | 29 ++++++++++++-----------------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/gup.c b/mm/gup.c
index b70d7ba7cc13..fc5f98069f4e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1238,8 +1238,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 	int locked = 0;
 	long ret = 0;
 
-	VM_BUG_ON(start & ~PAGE_MASK);
-	VM_BUG_ON(len != PAGE_ALIGN(len));
 	end = start + len;
 
 	for (nstart = start; nstart < end; nstart = nend) {
diff --git a/mm/mmap.c b/mm/mmap.c
index d1eb87ef4b1a..5801b5f0a634 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -186,8 +186,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
-
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
+		struct list_head *uf);
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
 	unsigned long retval;
@@ -245,7 +245,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
+	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
 		goto out;
 
 set_brk:
@@ -2929,21 +2929,14 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
+static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
-	unsigned long len;
 	struct rb_node **rb_link, *rb_parent;
 	pgoff_t pgoff = addr >> PAGE_SHIFT;
 	int error;
 
-	len = PAGE_ALIGN(request);
-	if (len < request)
-		return -ENOMEM;
-	if (!len)
-		return 0;
-
 	/* Until we need other flags, refuse anything except VM_EXEC. */
 	if ((flags & (~VM_EXEC)) != 0)
 		return -EINVAL;
@@ -3015,18 +3008,20 @@ out:
 	return 0;
 }
 
-static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
-{
-	return do_brk_flags(addr, len, 0, uf);
-}
-
-int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long len;
 	int ret;
 	bool populate;
 	LIST_HEAD(uf);
 
+	len = PAGE_ALIGN(request);
+	if (len < request)
+		return -ENOMEM;
+	if (!len)
+		return 0;
+
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
-- 
cgit v1.2.3


From d1b47a7c9efcf3c3384b70f6e3c8f1423b44d8c7 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Mon, 16 Jul 2018 11:16:30 -0400
Subject: mm: don't do zero_resv_unavail if memmap is not allocated

Moving zero_resv_unavail before memmap_init_zone(), caused a regression on
x86-32.

The cause is that we access struct pages before they are allocated when
CONFIG_FLAT_NODE_MEM_MAP is used.

free_area_init_nodes()
  zero_resv_unavail()
    mm_zero_struct_page(pfn_to_page(pfn)); <- struct page is not alloced
  free_area_init_node()
    if CONFIG_FLAT_NODE_MEM_MAP
      alloc_node_mem_map()
        memblock_virt_alloc_node_nopanic() <- struct page alloced here

On the other hand memblock_virt_alloc_node_nopanic() zeroes all the memory
that it returns, so we do not need to do zero_resv_unavail() here.

Fixes: e181ae0c5db9 ("mm: zero unavailable pages before memmap init")
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Tested-by: Matt Hart <matt@mattface.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/page_alloc.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0fbb9ffe380..3982c83fdcbf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2132,7 +2132,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
 					struct mminit_pfnnid_cache *state);
 #endif
 
-#ifdef CONFIG_HAVE_MEMBLOCK
+#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
 void zero_resv_unavail(void);
 #else
 static inline void zero_resv_unavail(void) {}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d800d61ddb7..a790ef4be74e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6383,7 +6383,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	free_area_init_core(pgdat);
 }
 
-#ifdef CONFIG_HAVE_MEMBLOCK
+#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
 /*
  * Only struct pages that are backed by physical memory are zeroed and
  * initialized by going through __init_single_page(). But, there are some
@@ -6421,7 +6421,7 @@ void __paginginit zero_resv_unavail(void)
 	if (pgcnt)
 		pr_info("Reserved but unavailable: %lld pages", pgcnt);
 }
-#endif /* CONFIG_HAVE_MEMBLOCK */
+#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 
-- 
cgit v1.2.3


From 1937367205d930deafb11c387b0a71dd215254d5 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Fri, 20 Jul 2018 17:53:31 -0700
Subject: mm/memblock: add missing include <linux/bootmem.h>

Commit 26f09e9b3a06 ("mm/memblock: add memblock memory allocation apis")
introduced two new function definitions:

  memblock_virt_alloc_try_nid_nopanic()
  memblock_virt_alloc_try_nid()

and commit ea1f5f3712af ("mm: define memblock_virt_alloc_try_nid_raw")
introduced the following function definition:

  memblock_virt_alloc_try_nid_raw()

This commit adds an include of header file <linux/bootmem.h> to provide
the missing function prototypes.  This silences the following gcc warning
(W=1):

  mm/memblock.c:1334:15: warning: no previous prototype for `memblock_virt_alloc_try_nid_raw' [-Wmissing-prototypes]
  mm/memblock.c:1371:15: warning: no previous prototype for `memblock_virt_alloc_try_nid_nopanic' [-Wmissing-prototypes]
  mm/memblock.c:1407:15: warning: no previous prototype for `memblock_virt_alloc_try_nid' [-Wmissing-prototypes]

Also adds #ifdef blockers to prevent compilation failure on mips/ia64
where CONFIG_NO_BOOTMEM=n as could be seen in commit commit 6cc22dc08a24
("revert "mm/memblock: add missing include <linux/bootmem.h>"").

Because Makefile already does:

  obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o

The #ifdef has been simplified from:

  #if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)

to simply:

  #if defined(CONFIG_NO_BOOTMEM)

Link: http://lkml.kernel.org/r/20180626184422.24974-1-malat@debian.org
Signed-off-by: Mathieu Malaterre <malat@debian.org>
Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 11e46f83e1ad..4b5d245fafc1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,7 @@
 #include <linux/kmemleak.h>
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 
 #include <asm/sections.h>
 #include <linux/io.h>
@@ -1225,6 +1226,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
 	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 }
 
+#if defined(CONFIG_NO_BOOTMEM)
 /**
  * memblock_virt_alloc_internal - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
@@ -1432,6 +1434,7 @@ void * __init memblock_virt_alloc_try_nid(
 	      (u64)max_addr);
 	return NULL;
 }
+#endif
 
 /**
  * __memblock_free_early - free boot memory block
-- 
cgit v1.2.3


From e1f1b1572e8db87a56609fd05bef76f98f0e456a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 20 Jul 2018 17:53:45 -0700
Subject: mm/huge_memory.c: fix data loss when splitting a file pmd

__split_huge_pmd_locked() must check if the cleared huge pmd was dirty,
and propagate that to PageDirty: otherwise, data may be lost when a huge
tmpfs page is modified then split then reclaimed.

How has this taken so long to be noticed?  Because there was no problem
when the huge page is written by a write system call (shmem_write_end()
calls set_page_dirty()), nor when the page is allocated for a write fault
(fault_dirty_shared_page() calls set_page_dirty()); but when allocated for
a read fault (which MAP_POPULATE simulates), no set_page_dirty().

Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1807111741430.1106@eggly.anvils
Fixes: d21b9e57c74c ("thp: handle file pages in split_huge_pmd()")
Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Ashwin Chaugule <ashwinch@google.com>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: <stable@vger.kernel.org>	[4.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1cd7c1a57a14..25346bd99364 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2084,6 +2084,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
+		if (!PageDirty(page) && pmd_dirty(_pmd))
+			set_page_dirty(page);
 		if (!PageReferenced(page) && pmd_young(_pmd))
 			SetPageReferenced(page);
 		page_remove_rmap(page, true);
-- 
cgit v1.2.3


From 9f15bde671355c351cf20d9f879004b234353100 Mon Sep 17 00:00:00 2001
From: Jing Xia <jing.xia.mail@gmail.com>
Date: Fri, 20 Jul 2018 17:53:48 -0700
Subject: mm: memcg: fix use after free in mem_cgroup_iter()

It was reported that a kernel crash happened in mem_cgroup_iter(), which
can be triggered if the legacy cgroup-v1 non-hierarchical mode is used.

Unable to handle kernel paging request at virtual address 6b6b6b6b6b6b8f
......
Call trace:
  mem_cgroup_iter+0x2e0/0x6d4
  shrink_zone+0x8c/0x324
  balance_pgdat+0x450/0x640
  kswapd+0x130/0x4b8
  kthread+0xe8/0xfc
  ret_from_fork+0x10/0x20

  mem_cgroup_iter():
      ......
      if (css_tryget(css))    <-- crash here
	    break;
      ......

The crashing reason is that mem_cgroup_iter() uses the memcg object whose
pointer is stored in iter->position, which has been freed before and
filled with POISON_FREE(0x6b).

And the root cause of the use-after-free issue is that
invalidate_reclaim_iterators() fails to reset the value of iter->position
to NULL when the css of the memcg is released in non- hierarchical mode.

Link: http://lkml.kernel.org/r/1531994807-25639-1-git-send-email-jing.xia@unisoc.com
Fixes: 6df38689e0e9 ("mm: memcontrol: fix possible memcg leak due to interrupted reclaim")
Signed-off-by: Jing Xia <jing.xia.mail@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <chunyan.zhang@unisoc.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5ef320a..8c0280b3143e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -850,7 +850,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	int nid;
 	int i;
 
-	while ((memcg = parent_mem_cgroup(memcg))) {
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 		for_each_node(nid) {
 			mz = mem_cgroup_nodeinfo(memcg, nid);
 			for (i = 0; i <= DEF_PRIORITY; i++) {
-- 
cgit v1.2.3


From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 13:48:51 -0700
Subject: mm: use helper functions for allocating and freeing vm_area structs

The vm_area_struct is one of the most fundamental memory management
objects, but the management of it is entirely open-coded evertwhere,
ranging from allocation and freeing (using kmem_cache_[z]alloc and
kmem_cache_free) to initializing all the fields.

We want to unify this in order to end up having some unified
initialization of the vmas, and the first step to this is to at least
have basic allocation functions.

Right now those functions are literally just wrappers around the
kmem_cache_*() calls.  This is a purely mechanical conversion:

    # new vma:
    kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc()

    # copy old vma
    kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old)

    # free vma
    kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma)

to the point where the old vma passed in to the vm_area_dup() function
isn't even used yet (because I've left all the old manual initialization
alone).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 arch/ia64/mm/init.c        |  8 ++++----
 fs/exec.c                  |  4 ++--
 include/linux/mm.h         |  4 +++-
 kernel/fork.c              | 21 ++++++++++++++++++---
 mm/mmap.c                  | 22 +++++++++++-----------
 mm/nommu.c                 |  8 ++++----
 7 files changed, 44 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 3b38c717008a..e859246badca 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
@@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	return 0;
 
 error:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 error_kmem:
 	pfm_rvfree(smpl_buf, size);
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 18278b448530..3f2321bffb72 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -114,7 +114,7 @@ ia64_init_addr_space (void)
 	 * the problem.  When the process attempts to write to the register backing store
 	 * for the first time, it will get a SEGFAULT in this case.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (vma) {
 		INIT_LIST_HEAD(&vma->anon_vma_chain);
 		vma->vm_mm = current->mm;
@@ -125,7 +125,7 @@ ia64_init_addr_space (void)
 		down_write(&current->mm->mmap_sem);
 		if (insert_vm_struct(current->mm, vma)) {
 			up_write(&current->mm->mmap_sem);
-			kmem_cache_free(vm_area_cachep, vma);
+			vm_area_free(vma);
 			return;
 		}
 		up_write(&current->mm->mmap_sem);
@@ -133,7 +133,7 @@ ia64_init_addr_space (void)
 
 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
 	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+		vma = vm_area_alloc();
 		if (vma) {
 			INIT_LIST_HEAD(&vma->anon_vma_chain);
 			vma->vm_mm = current->mm;
@@ -144,7 +144,7 @@ ia64_init_addr_space (void)
 			down_write(&current->mm->mmap_sem);
 			if (insert_vm_struct(current->mm, vma)) {
 				up_write(&current->mm->mmap_sem);
-				kmem_cache_free(vm_area_cachep, vma);
+				vm_area_free(vma);
 				return;
 			}
 			up_write(&current->mm->mmap_sem);
diff --git a/fs/exec.c b/fs/exec.c
index 2d4e0075bd24..9bd83989ea25 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	bprm->vma = vma = vm_area_alloc();
 	if (!vma)
 		return -ENOMEM;
 
@@ -326,7 +326,7 @@ err:
 	up_write(&mm->mmap_sem);
 err_free:
 	bprm->vma = NULL;
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return err;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3982c83fdcbf..de2fd86c6154 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-extern struct kmem_cache *vm_area_cachep;
+struct vm_area_struct *vm_area_alloc(void);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
+void vm_area_free(struct vm_area_struct *);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61b925c..0e23deb5acfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -303,11 +303,26 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+static struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+struct vm_area_struct *vm_area_alloc(void)
+{
+	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	void *stack = task_stack_page(tsk);
@@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -539,7 +554,7 @@ fail_uprobe_end:
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	vm_area_free(tmp);
 fail_nomem:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
diff --git a/mm/mmap.c b/mm/mmap.c
index 5801b5f0a634..4286ad2dd1f5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return next;
 }
 
@@ -911,7 +911,7 @@ again:
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		vm_area_free(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
@@ -1832,7 +1832,7 @@ allow_write_and_free_vma:
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
@@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 			return err;
 	}
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = vm_area_dup(vma);
 	if (!new)
 		return -ENOMEM;
 
@@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
  out_free_mpol:
 	mpol_put(vma_policy(new));
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new);
+	vm_area_free(new);
 	return err;
 }
 
@@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma) {
 		vm_unacct_memory(len >> PAGE_SHIFT);
 		return -ENOMEM;
@@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		}
 		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		new_vma = vm_area_dup(vma);
 		if (!new_vma)
 			goto out;
 		*new_vma = *vma;
@@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 out_free_mempol:
 	mpol_put(vma_policy(new_vma));
 out_free_vma:
-	kmem_cache_free(vm_area_cachep, new_vma);
+	vm_area_free(new_vma);
 out:
 	return NULL;
 }
@@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping(
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (unlikely(vma == NULL))
 		return ERR_PTR(-ENOMEM);
 
@@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping(
 	return vma;
 
 out:
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return ERR_PTR(ret);
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 4452d8bd9ae4..006e3fe65017 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	put_nommu_region(vma->vm_region);
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 }
 
 /*
@@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file,
 	if (!region)
 		goto error_getting_region;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = vm_area_alloc();
 	if (!vma)
 		goto error_getting_vma;
 
@@ -1368,7 +1368,7 @@ error:
 	kmem_cache_free(vm_region_jar, region);
 	if (vma->vm_file)
 		fput(vma->vm_file);
-	kmem_cache_free(vm_area_cachep, vma);
+	vm_area_free(vma);
 	return ret;
 
 sharing_violation:
@@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!region)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = vm_area_dup(vma);
 	if (!new) {
 		kmem_cache_free(vm_region_jar, region);
 		return -ENOMEM;
-- 
cgit v1.2.3


From 95faf6992df468f617edb788da8c21c6eed0dfa7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 14:48:45 -0700
Subject: mm: make vm_area_dup() actually copy the old vma data

.. and re-initialize th eanon_vma_chain head.

This removes some boiler-plate from the users, and also makes it clear
why it didn't need use the 'zalloc()' version.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 +++++++---
 mm/mmap.c     |  7 -------
 mm/nommu.c    |  1 -
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e23deb5acfc..67253e41bfb0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,7 +315,13 @@ struct vm_area_struct *vm_area_alloc(void)
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
-	return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+	if (new) {
+		*new = *orig;
+		INIT_LIST_HEAD(&new->anon_vma_chain);
+	}
+	return new;
 }
 
 void vm_area_free(struct vm_area_struct *vma)
@@ -473,8 +479,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		tmp = vm_area_dup(mpnt);
 		if (!tmp)
 			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
 		retval = vma_dup_policy(mpnt, tmp);
 		if (retval)
 			goto fail_nomem_policy;
diff --git a/mm/mmap.c b/mm/mmap.c
index 4286ad2dd1f5..b0ed8ce1b67e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2624,11 +2624,6 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!new)
 		return -ENOMEM;
 
-	/* most fields are the same, copy all, and then fixup */
-	*new = *vma;
-
-	INIT_LIST_HEAD(&new->anon_vma_chain);
-
 	if (new_below)
 		new->vm_end = addr;
 	else {
@@ -3205,13 +3200,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma = vm_area_dup(vma);
 		if (!new_vma)
 			goto out;
-		*new_vma = *vma;
 		new_vma->vm_start = addr;
 		new_vma->vm_end = addr + len;
 		new_vma->vm_pgoff = pgoff;
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
-		INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
diff --git a/mm/nommu.c b/mm/nommu.c
index 006e3fe65017..c2560e9cc803 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1476,7 +1476,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/* most fields are the same, copy all, and then fixup */
-	*new = *vma;
 	*region = *vma->vm_region;
 	new->vm_region = region;
 
-- 
cgit v1.2.3


From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 21 Jul 2018 15:24:03 -0700
Subject: mm: make vm_area_alloc() initialize core fields

Like vm_area_dup(), it initializes the anon_vma_chain head, and the
basic mm pointer.

The rest of the fields end up being different for different users,
although the plan is to also initialize the 'vm_ops' field to a dummy
entry.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 +---
 arch/ia64/mm/init.c        |  8 ++------
 fs/exec.c                  |  4 +---
 include/linux/mm.h         |  2 +-
 kernel/fork.c              | 10 ++++++++--
 mm/mmap.c                  | 12 +++---------
 mm/nommu.c                 |  3 +--
 7 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index e859246badca..46bff1661836 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	DPRINT(("smpl_buf @%p\n", smpl_buf));
 
 	/* allocate vma */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		DPRINT(("Cannot allocate vma\n"));
 		goto error_kmem;
 	}
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	/*
 	 * partially initialize the vma for the sampling buffer
 	 */
-	vma->vm_mm	     = mm;
 	vma->vm_file	     = get_file(filp);
 	vma->vm_flags	     = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 3f2321bffb72..bdb14a369137 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -114,10 +114,8 @@ ia64_init_addr_space (void)
 	 * the problem.  When the process attempts to write to the register backing store
 	 * for the first time, it will get a SEGFAULT in this case.
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(current->mm);
 	if (vma) {
-		INIT_LIST_HEAD(&vma->anon_vma_chain);
-		vma->vm_mm = current->mm;
 		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
 		vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
@@ -133,10 +131,8 @@ ia64_init_addr_space (void)
 
 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
 	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = vm_area_alloc();
+		vma = vm_area_alloc(current->mm);
 		if (vma) {
-			INIT_LIST_HEAD(&vma->anon_vma_chain);
-			vma->vm_mm = current->mm;
 			vma->vm_end = PAGE_SIZE;
 			vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
 			vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO |
diff --git a/fs/exec.c b/fs/exec.c
index 9bd83989ea25..72e961a62adb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = vm_area_alloc();
+	bprm->vma = vma = vm_area_alloc(mm);
 	if (!vma)
 		return -ENOMEM;
 
@@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 		err = -EINTR;
 		goto err_free;
 	}
-	vma->vm_mm = mm;
 
 	/*
 	 * Place the stack at the largest stack address the architecture
@@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
 	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	err = insert_vm_struct(mm, vma);
 	if (err)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index de2fd86c6154..d3a3842316b8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-struct vm_area_struct *vm_area_alloc(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 67253e41bfb0..a191c05e757d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-struct vm_area_struct *vm_area_alloc(void)
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+
+	if (vma) {
+		vma->vm_mm = mm;
+		INIT_LIST_HEAD(&vma->anon_vma_chain);
+	}
+	return vma;
 }
 
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
diff --git a/mm/mmap.c b/mm/mmap.c
index b0ed8ce1b67e..ff1944d8d458 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
 	}
 
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
 	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 	vma->vm_pgoff = pgoff;
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
 	if (file) {
 		if (vm_flags & VM_DENYWRITE) {
@@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (!vma) {
 		vm_unacct_memory(len >> PAGE_SHIFT);
 		return -ENOMEM;
 	}
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
@@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping(
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(mm);
 	if (unlikely(vma == NULL))
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index c2560e9cc803..1d22fdbf7d7c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file,
 	if (!region)
 		goto error_getting_region;
 
-	vma = vm_area_alloc();
+	vma = vm_area_alloc(current->mm);
 	if (!vma)
 		goto error_getting_vma;
 
@@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file,
 	region->vm_flags = vm_flags;
 	region->vm_pgoff = pgoff;
 
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_flags = vm_flags;
 	vma->vm_pgoff = pgoff;
 
-- 
cgit v1.2.3


From 2c4541e24c55e2847bede93e33d749280edd429a Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Jul 2018 16:37:30 -0700
Subject: mm: use vma_init() to initialize VMAs on stack and data segments

Make sure to initialize all VMAs properly, not only those which come
from vm_area_cachep.

Link: http://lkml.kernel.org/r/20180724121139.62570-3-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/process.c    | 1 +
 arch/arm/mach-rpc/ecard.c    | 2 +-
 arch/arm64/include/asm/tlb.h | 4 +++-
 arch/arm64/mm/hugetlbpage.c  | 7 +++++--
 arch/ia64/include/asm/tlb.h  | 2 +-
 arch/ia64/mm/init.c          | 2 +-
 arch/x86/um/mem_32.c         | 2 +-
 fs/hugetlbfs/inode.c         | 2 ++
 mm/mempolicy.c               | 1 +
 mm/shmem.c                   | 1 +
 10 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 225d1c58d2de..d9c299133111 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -338,6 +338,7 @@ static struct vm_area_struct gate_vma = {
 
 static int __init gate_vma_init(void)
 {
+	vma_init(&gate_vma, NULL);
 	gate_vma.vm_page_prot = PAGE_READONLY_EXEC;
 	return 0;
 }
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 39aef4876ed4..8db62cc54a6a 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -237,8 +237,8 @@ static void ecard_init_pgtables(struct mm_struct *mm)
 
 	memcpy(dst_pgd, src_pgd, sizeof(pgd_t) * (EASI_SIZE / PGDIR_SIZE));
 
+	vma_init(&vma, mm);
 	vma.vm_flags = VM_EXEC;
-	vma.vm_mm = mm;
 
 	flush_tlb_range(&vma, IO_START, IO_START + IO_SIZE);
 	flush_tlb_range(&vma, EASI_START, EASI_START + EASI_SIZE);
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index ffdaea7954bb..d87f2d646caa 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -37,7 +37,9 @@ static inline void __tlb_remove_table(void *_table)
 
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
-	struct vm_area_struct vma = { .vm_mm = tlb->mm, };
+	struct vm_area_struct vma;
+
+	vma_init(&vma, tlb->mm);
 
 	/*
 	 * The ASID allocator will either invalidate the ASID or mark
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index ecc6818191df..1854e49aa18a 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -108,11 +108,13 @@ static pte_t get_clear_flush(struct mm_struct *mm,
 			     unsigned long pgsize,
 			     unsigned long ncontig)
 {
-	struct vm_area_struct vma = { .vm_mm = mm };
+	struct vm_area_struct vma;
 	pte_t orig_pte = huge_ptep_get(ptep);
 	bool valid = pte_valid(orig_pte);
 	unsigned long i, saddr = addr;
 
+	vma_init(&vma, mm);
+
 	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
 		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
 
@@ -145,9 +147,10 @@ static void clear_flush(struct mm_struct *mm,
 			     unsigned long pgsize,
 			     unsigned long ncontig)
 {
-	struct vm_area_struct vma = { .vm_mm = mm };
+	struct vm_area_struct vma;
 	unsigned long i, saddr = addr;
 
+	vma_init(&vma, mm);
 	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
 		pte_clear(mm, addr, ptep);
 
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 44f0ac0df308..db89e7306081 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -120,7 +120,7 @@ ia64_tlb_flush_mmu_tlbonly(struct mmu_gather *tlb, unsigned long start, unsigned
 		 */
 		struct vm_area_struct vma;
 
-		vma.vm_mm = tlb->mm;
+		vma_init(&vma, tlb->mm);
 		/* flush the address range from the tlb: */
 		flush_tlb_range(&vma, start, end);
 		/* now flush the virt. page-table area mapping the address range: */
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index bdb14a369137..e6c6dfd98de2 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -273,7 +273,7 @@ static struct vm_area_struct gate_vma;
 
 static int __init gate_vma_init(void)
 {
-	gate_vma.vm_mm = NULL;
+	vma_init(&gate_vma, NULL);
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index 744afdc18cf3..56c44d865f7b 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -16,7 +16,7 @@ static int __init gate_vma_init(void)
 	if (!FIXADDR_USER_START)
 		return 0;
 
-	gate_vma.vm_mm = NULL;
+	vma_init(&gate_vma, NULL);
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d508c7844681..40d4c66c7751 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -411,6 +411,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	bool truncate_op = (lend == LLONG_MAX);
 
 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+	vma_init(&pseudo_vma, current->mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pagevec_init(&pvec);
 	next = start;
@@ -595,6 +596,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 	 * as input to create an allocation policy.
 	 */
 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+	vma_init(&pseudo_vma, mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pseudo_vma.vm_file = file;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9ac49ef17b4e..01f1a14facc4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2505,6 +2505,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 
 		/* Create pseudo-vma that contains just the policy */
 		memset(&pvma, 0, sizeof(struct vm_area_struct));
+		vma_init(&pvma, NULL);
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 2cab84403055..41b9bbf24e16 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1421,6 +1421,7 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
 {
 	/* Create a pseudo vma that just contains the policy */
 	memset(vma, 0, sizeof(*vma));
+	vma_init(vma, NULL);
 	/* Bias interleave by inode number to distribute better across nodes */
 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
 	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-- 
cgit v1.2.3


From bfd40eaff5abb9f62c8ef94ca13ed0d94a560f10 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Jul 2018 16:37:35 -0700
Subject: mm: fix vma_is_anonymous() false-positives

vma_is_anonymous() relies on ->vm_ops being NULL to detect anonymous
VMA.  This is unreliable as ->mmap may not set ->vm_ops.

False-positive vma_is_anonymous() may lead to crashes:

	next ffff8801ce5e7040 prev ffff8801d20eca50 mm ffff88019c1e13c0
	prot 27 anon_vma ffff88019680cdd8 vm_ops 0000000000000000
	pgoff 0 file ffff8801b2ec2d00 private_data 0000000000000000
	flags: 0xff(read|write|exec|shared|mayread|maywrite|mayexec|mayshare)
	------------[ cut here ]------------
	kernel BUG at mm/memory.c:1422!
	invalid opcode: 0000 [#1] SMP KASAN
	CPU: 0 PID: 18486 Comm: syz-executor3 Not tainted 4.18.0-rc3+ #136
	Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google
	01/01/2011
	RIP: 0010:zap_pmd_range mm/memory.c:1421 [inline]
	RIP: 0010:zap_pud_range mm/memory.c:1466 [inline]
	RIP: 0010:zap_p4d_range mm/memory.c:1487 [inline]
	RIP: 0010:unmap_page_range+0x1c18/0x2220 mm/memory.c:1508
	Call Trace:
	 unmap_single_vma+0x1a0/0x310 mm/memory.c:1553
	 zap_page_range_single+0x3cc/0x580 mm/memory.c:1644
	 unmap_mapping_range_vma mm/memory.c:2792 [inline]
	 unmap_mapping_range_tree mm/memory.c:2813 [inline]
	 unmap_mapping_pages+0x3a7/0x5b0 mm/memory.c:2845
	 unmap_mapping_range+0x48/0x60 mm/memory.c:2880
	 truncate_pagecache+0x54/0x90 mm/truncate.c:800
	 truncate_setsize+0x70/0xb0 mm/truncate.c:826
	 simple_setattr+0xe9/0x110 fs/libfs.c:409
	 notify_change+0xf13/0x10f0 fs/attr.c:335
	 do_truncate+0x1ac/0x2b0 fs/open.c:63
	 do_sys_ftruncate+0x492/0x560 fs/open.c:205
	 __do_sys_ftruncate fs/open.c:215 [inline]
	 __se_sys_ftruncate fs/open.c:213 [inline]
	 __x64_sys_ftruncate+0x59/0x80 fs/open.c:213
	 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Reproducer:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <fcntl.h>

	#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
	#define KCOV_ENABLE			_IO('c', 100)
	#define KCOV_DISABLE			_IO('c', 101)
	#define COVER_SIZE			(1024<<10)

	#define KCOV_TRACE_PC  0
	#define KCOV_TRACE_CMP 1

	int main(int argc, char **argv)
	{
		int fd;
		unsigned long *cover;

		system("mount -t debugfs none /sys/kernel/debug");
		fd = open("/sys/kernel/debug/kcov", O_RDWR);
		ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
		cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
				PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		munmap(cover, COVER_SIZE * sizeof(unsigned long));
		cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
				PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
		memset(cover, 0, COVER_SIZE * sizeof(unsigned long));
		ftruncate(fd, 3UL << 20);
		return 0;
	}

This can be fixed by assigning anonymous VMAs own vm_ops and not relying
on it being NULL.

If ->mmap() failed to set ->vm_ops, mmap_region() will set it to
dummy_vm_ops.  This way we will have non-NULL ->vm_ops for all VMAs.

Link: http://lkml.kernel.org/r/20180724121139.62570-4-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: syzbot+3f84280d52be9b7083cc@syzkaller.appspotmail.com
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/mem.c | 1 +
 fs/exec.c          | 1 +
 include/linux/mm.h | 8 ++++++++
 mm/mmap.c          | 3 +++
 mm/nommu.c         | 2 ++
 5 files changed, 15 insertions(+)

(limited to 'mm')

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index ffeb60d3434c..df66a9dd0aae 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -708,6 +708,7 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 #endif
 	if (vma->vm_flags & VM_SHARED)
 		return shmem_zero_setup(vma);
+	vma_set_anonymous(vma);
 	return 0;
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index 72e961a62adb..bdd0eacefdf5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -293,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	bprm->vma = vma = vm_area_alloc(mm);
 	if (!vma)
 		return -ENOMEM;
+	vma_set_anonymous(vma);
 
 	if (down_write_killable(&mm->mmap_sem)) {
 		err = -EINTR;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 31540f166987..7ba6d356d18f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -454,10 +454,18 @@ struct vm_operations_struct {
 
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+	static const struct vm_operations_struct dummy_vm_ops = {};
+
 	vma->vm_mm = mm;
+	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 }
 
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+	vma->vm_ops = NULL;
+}
+
 struct mmu_gather;
 struct inode;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index ff1944d8d458..17bbf4d3e24f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1778,6 +1778,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		error = shmem_zero_setup(vma);
 		if (error)
 			goto free_vma;
+	} else {
+		vma_set_anonymous(vma);
 	}
 
 	vma_link(mm, vma, prev, rb_link, rb_parent);
@@ -2983,6 +2985,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
 		return -ENOMEM;
 	}
 
+	vma_set_anonymous(vma);
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
diff --git a/mm/nommu.c b/mm/nommu.c
index 1d22fdbf7d7c..9fc9e43335b6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1145,6 +1145,8 @@ static int do_mmap_private(struct vm_area_struct *vma,
 		if (ret < len)
 			memset(base + ret, 0, len - ret);
 
+	} else {
+		vma_set_anonymous(vma);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 16e536ef47f567289a5699abee9ff7bb304bc12d Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@redhat.com>
Date: Thu, 26 Jul 2018 16:37:42 -0700
Subject: zswap: re-check zswap_is_full() after do zswap_shrink()

/sys/../zswap/stored_pages keeps rising in a zswap test with
"zswap.max_pool_percent=0" parameter.  But it should not compress or
store pages any more since there is no space in the compressed pool.

Reproduce steps:
  1. Boot kernel with "zswap.enabled=1"
  2. Set the max_pool_percent to 0
      # echo 0 > /sys/module/zswap/parameters/max_pool_percent
  3. Do memory stress test to see if some pages have been compressed
      # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
  4. Watching the 'stored_pages' number increasing or not

The root cause is:

  When zswap_max_pool_percent is set to 0 via kernel parameter,
  zswap_is_full() will always return true due to zswap_shrink().  But if
  the shinking is able to reclain a page successfully the code then
  proceeds to compressing/storing another page, so the value of
  stored_pages will keep changing.

To solve the issue, this patch adds a zswap_is_full() check again after
  zswap_shrink() to make sure it's now under the max_pool_percent, and to
  not compress/store if we reached the limit.

Link: http://lkml.kernel.org/r/20180530103936.17812-1-liwang@redhat.com
Signed-off-by: Li Wang <liwang@redhat.com>
Acked-by: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Huang Ying <huang.ying.caritas@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zswap.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'mm')

diff --git a/mm/zswap.c b/mm/zswap.c
index 7d34e69507e3..cd91fd9d96b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 			ret = -ENOMEM;
 			goto reject;
 		}
+
+		/* A second zswap_is_full() check after
+		 * zswap_shrink() to make sure it's now
+		 * under the max_pool_percent
+		 */
+		if (zswap_is_full()) {
+			ret = -ENOMEM;
+			goto reject;
+		}
 	}
 
 	/* allocate entry */
-- 
cgit v1.2.3