From 0633da48f0793aeba27f82d30605624416723a91 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:51:45 -0700
Subject: autofs: fix autofs_sbi() does not check super block type

autofs_sbi() does not check the superblock magic number to verify it has
been given an autofs super block.

Link: http://lkml.kernel.org/r/153475422934.17131.7563724552005298277.stgit@pluto.themaw.net
Reported-by: <syzbot+87c3c541582e56943277@syzkaller.appspotmail.com>
Signed-off-by: Ian Kent <raven@themaw.net>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h | 4 +++-
 fs/autofs/inode.c    | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 9400a9f6318a..5057b9f0f846 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/completion.h>
 #include <linux/file.h>
+#include <linux/magic.h>
 
 /* This is the range of ioctl() numbers we claim as ours */
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
@@ -124,7 +125,8 @@ struct autofs_sb_info {
 
 static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
 {
-	return (struct autofs_sb_info *)(sb->s_fs_info);
+	return sb->s_magic != AUTOFS_SUPER_MAGIC ?
+		NULL : (struct autofs_sb_info *)(sb->s_fs_info);
 }
 
 static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index b51980fc274e..846c052569dd 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -10,7 +10,6 @@
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
 #include <linux/parser.h>
-#include <linux/magic.h>
 
 #include "autofs_i.h"
 
-- 
cgit v1.2.3


From 8df4a44cc46bf9dcfb80a37a59ae5dea2232dc58 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 21 Aug 2018 21:51:49 -0700
Subject: mm: check shrinker is memcg-aware in register_shrinker_prepared()

There is a sad BUG introduced in patch adding SHRINKER_REGISTERING.
shrinker_idr business is only for memcg-aware shrinkers.  Only such type
of shrinkers have id and they must be finaly installed via idr_replace()
in this function.  For !memcg-aware shrinkers we never initialize
shrinker->id field.

But there are all types of shrinkers passed to idr_replace(), and every
!memcg-aware shrinker with random ID (most probably, its id is 0)
replaces memcg-aware shrinker pointed by the ID in IDR.

This patch fixes the problem.

Link: http://lkml.kernel.org/r/8ff8a793-8211-713a-4ed9-d6e52390c2fc@virtuozzo.com
Fixes: 7e010df53c80 "mm: use special value SHRINKER_REGISTERING instead of list_empty() check"
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reported-by: <syzbot+d5f648a1bfe15678786b@syzkaller.appspotmail.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: <syzkaller-bugs@googlegroups.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4375b1e9bd56..3c6e2bfee427 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 #ifdef CONFIG_MEMCG_KMEM
-	idr_replace(&shrinker_idr, shrinker, shrinker->id);
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		idr_replace(&shrinker_idr, shrinker, shrinker->id);
 #endif
 	up_write(&shrinker_rwsem);
 }
-- 
cgit v1.2.3


From 92be775a3db343889ab159c44d55315c5ee0be4e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 21 Aug 2018 21:51:53 -0700
Subject: mm: struct shrink_control: keep int fields together

Patch series "Reorderings in struct shrinker and struct shrink_control".

These structures are intensively used during reclaim and, displace other
data in cache, so there is no a reason they have int fields not grouped
together.

This patch (of 2):

gfp_t is of unsigned type, so let's move nid to keep them together.

Link: http://lkml.kernel.org/r/153199747930.21131.861043607301997810.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shrinker.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index b154fd2b084c..d58aaaed34a4 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -12,6 +12,9 @@
 struct shrink_control {
 	gfp_t gfp_mask;
 
+	/* current node being shrunk (for NUMA aware shrinkers) */
+	int nid;
+
 	/*
 	 * How many objects scan_objects should scan and try to reclaim.
 	 * This is reset before every call, so it is safe for callees
@@ -26,9 +29,6 @@ struct shrink_control {
 	 */
 	unsigned long nr_scanned;
 
-	/* current node being shrunk (for NUMA aware shrinkers) */
-	int nid;
-
 	/* current memcg being shrunk (for memcg aware shrinkers) */
 	struct mem_cgroup *memcg;
 };
-- 
cgit v1.2.3


From e50ef89b0f58a2cb0e7d496a97b851e0dced62a6 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 21 Aug 2018 21:51:57 -0700
Subject: mm: struct shrinker: make flags of unsigned type

Currently, there are two flags only, so unsigned is more then enough.
Also, move int seeks to keep these fields together.

Link: http://lkml.kernel.org/r/153199748720.21131.6476256940113102483.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shrinker.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index d58aaaed34a4..9443cafd1969 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -63,9 +63,9 @@ struct shrinker {
 	unsigned long (*scan_objects)(struct shrinker *,
 				      struct shrink_control *sc);
 
-	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
-	unsigned long flags;
+	int seeks;	/* seeks to recreate an obj */
+	unsigned flags;
 
 	/* These are for internal use */
 	struct list_head list;
-- 
cgit v1.2.3


From 59d98bf3c2b9218ff4cdb2a70aa52fffedf1786c Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:01 -0700
Subject: mm: swap: add comments to lock_cluster_or_swap_info()

Patch series "swap: THP optimizing refactoring", v4.

Now the THP (Transparent Huge Page) swap optimizing is implemented in the
way like below,

  #ifdef CONFIG_THP_SWAP
  huge_function(...)
  {
  }
  #else
  normal_function(...)
  {
  }
  #endif

  general_function(...)
  {
  	if (huge)
  		return thp_function(...);
	else
  		return normal_function(...);
  }

As pointed out by Dave Hansen, this will,

1. Create a new, wholly untested code path for huge page
2. Create two places to patch bugs
3. Are not reusing code when possible

This patchset is to address these problems via merging huge/normal code
path/functions if possible.

One concern is that this may cause code size to dilate when
!CONFIG_TRANSPARENT_HUGEPAGE.  The data shows that most refactoring will
only cause quite slight code size increase.

This patch (of 8):

To improve code readability.

Link: http://lkml.kernel.org/r/20180720071845.17920-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-and-acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8837b22c848d..52a9dd9dab8e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -296,13 +296,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
 		spin_unlock(&ci->lock);
 }
 
+/*
+ * Determine the locking method in use for this device.  Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
 static inline struct swap_cluster_info *lock_cluster_or_swap_info(
-	struct swap_info_struct *si,
-	unsigned long offset)
+		struct swap_info_struct *si, unsigned long offset)
 {
 	struct swap_cluster_info *ci;
 
+	/* Try to use fine-grained SSD-style locking if available: */
 	ci = lock_cluster(si, offset);
+	/* Otherwise, fall back to traditional, coarse locking: */
 	if (!ci)
 		spin_lock(&si->lock);
 
-- 
cgit v1.2.3


From fe5266d5d5948abc6e71cdabb98e3f5cba811205 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:05 -0700
Subject: mm/swapfile.c: replace some #ifdef with IS_ENABLED()

In mm/swapfile.c, THP (Transparent Huge Page) swap specific code is
enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid code dilating when
THP isn't enabled.  But #ifdef/#endif in .c file hurt the code
readability, so Dave suggested to use IS_ENABLED(CONFIG_THP_SWAP)
instead and let compiler to do the dirty job for us.  This has potential
to remove some duplicated code too.  From output of `size`,

		text	   data	    bss	    dec	    hex	filename
THP=y:         26269	   2076	    340	  28685	   700d	mm/swapfile.o
ifdef/endif:   24115	   2028	    340	  26483	   6773	mm/swapfile.o
IS_ENABLED:    24179	   2028	    340	  26547	   67b3	mm/swapfile.o

IS_ENABLED() based solution works quite well, almost as good as that of
#ifdef/#endif.  And from the diffstat, the removed lines are more than
added lines.

One #ifdef for split_swap_cluster() is kept.  Because it is a public
function with a stub implementation for CONFIG_THP_SWAP=n in swap.h.

Link: http://lkml.kernel.org/r/20180720071845.17920-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-and-acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 60 ++++++++++++++++++++---------------------------------------
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 52a9dd9dab8e..618358ad464b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -868,7 +868,6 @@ no_page:
 	return n_ret;
 }
 
-#ifdef CONFIG_THP_SWAP
 static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 {
 	unsigned long idx;
@@ -876,6 +875,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 	unsigned long offset, i;
 	unsigned char *map;
 
+	/*
+	 * Should not even be attempting cluster allocations when huge
+	 * page swap is disabled.  Warn and fail the allocation.
+	 */
+	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+		VM_WARN_ON_ONCE(1);
+		return 0;
+	}
+
 	if (cluster_list_empty(&si->free_clusters))
 		return 0;
 
@@ -906,13 +914,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	unlock_cluster(ci);
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-	VM_WARN_ON_ONCE(1);
-	return 0;
-}
-#endif /* CONFIG_THP_SWAP */
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
 				   unsigned char usage)
@@ -1200,7 +1201,6 @@ static void swapcache_free(swp_entry_t entry)
 	}
 }
 
-#ifdef CONFIG_THP_SWAP
 static void swapcache_free_cluster(swp_entry_t entry)
 {
 	unsigned long offset = swp_offset(entry);
@@ -1211,6 +1211,9 @@ static void swapcache_free_cluster(swp_entry_t entry)
 	unsigned int i, free_entries = 0;
 	unsigned char val;
 
+	if (!IS_ENABLED(CONFIG_THP_SWAP))
+		return;
+
 	si = _swap_info_get(entry);
 	if (!si)
 		return;
@@ -1246,6 +1249,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
 	}
 }
 
+#ifdef CONFIG_THP_SWAP
 int split_swap_cluster(swp_entry_t entry)
 {
 	struct swap_info_struct *si;
@@ -1260,11 +1264,7 @@ int split_swap_cluster(swp_entry_t entry)
 	unlock_cluster(ci);
 	return 0;
 }
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
+#endif
 
 void put_swap_page(struct page *page, swp_entry_t entry)
 {
@@ -1414,7 +1414,6 @@ out:
 	return count;
 }
 
-#ifdef CONFIG_THP_SWAP
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 					 swp_entry_t entry)
 {
@@ -1425,6 +1424,9 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 	int i;
 	bool ret = false;
 
+	if (!IS_ENABLED(CONFIG_THP_SWAP))
+		return swap_swapcount(si, entry) != 0;
+
 	ci = lock_cluster_or_swap_info(si, offset);
 	if (!ci || !cluster_is_huge(ci)) {
 		if (map[roffset] != SWAP_HAS_CACHE)
@@ -1447,7 +1449,7 @@ static bool page_swapped(struct page *page)
 	swp_entry_t entry;
 	struct swap_info_struct *si;
 
-	if (likely(!PageTransCompound(page)))
+	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
 		return page_swapcount(page) != 0;
 
 	page = compound_head(page);
@@ -1471,10 +1473,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
 	/* hugetlbfs shouldn't call it */
 	VM_BUG_ON_PAGE(PageHuge(page), page);
 
-	if (likely(!PageTransCompound(page))) {
-		mapcount = atomic_read(&page->_mapcount) + 1;
-		if (total_mapcount)
-			*total_mapcount = mapcount;
+	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+		mapcount = page_trans_huge_mapcount(page, total_mapcount);
 		if (PageSwapCache(page))
 			swapcount = page_swapcount(page);
 		if (total_swapcount)
@@ -1521,26 +1521,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
 
 	return map_swapcount;
 }
-#else
-#define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
-#define page_swapped(page)			(page_swapcount(page) != 0)
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
-					 int *total_swapcount)
-{
-	int mapcount, swapcount = 0;
-
-	/* hugetlbfs shouldn't call it */
-	VM_BUG_ON_PAGE(PageHuge(page), page);
-
-	mapcount = page_trans_huge_mapcount(page, total_mapcount);
-	if (PageSwapCache(page))
-		swapcount = page_swapcount(page);
-	if (total_swapcount)
-		*total_swapcount = swapcount;
-	return mapcount + swapcount;
-}
-#endif
 
 /*
  * We can write to an anon page without COW if there are no other references
-- 
cgit v1.2.3


From afa4711ef1399beb60964dcb2cd0369dfb0ac20a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:09 -0700
Subject: mm/swapfile.c: use swap_count() in swap_page_trans_huge_swapped()

In swap_page_trans_huge_swapped(), to identify whether there's any page
table mapping for a 4k sized swap entry, "si->swap_map[i] !=
SWAP_HAS_CACHE" is used.  This works correctly now, because all users of
the function will only call it after checking SWAP_HAS_CACHE.  But as
pointed out by Daniel, it is better to use "swap_count(map[i])" here,
because it works for "map[i] == 0" case too.

And this makes the implementation more consistent between normal and
huge swap entry.

Link: http://lkml.kernel.org/r/20180720071845.17920-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-and-reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 618358ad464b..c291400c4463 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1429,12 +1429,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 
 	ci = lock_cluster_or_swap_info(si, offset);
 	if (!ci || !cluster_is_huge(ci)) {
-		if (map[roffset] != SWAP_HAS_CACHE)
+		if (swap_count(map[roffset]))
 			ret = true;
 		goto unlock_out;
 	}
 	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-		if (map[offset + i] != SWAP_HAS_CACHE) {
+		if (swap_count(map[offset + i])) {
 			ret = true;
 			break;
 		}
-- 
cgit v1.2.3


From 33ee011e5656edef6a58952006c486d342b7bbb5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:13 -0700
Subject: mm/swapfile.c: unify normal/huge code path in
 swap_page_trans_huge_swapped()

As suggested by Dave, we should unify the code path for normal and huge
swap support if possible to avoid duplicated code, bugs, etc.  and make
it easier to review code.

In this patch, the normal/huge code path in
swap_page_trans_huge_swapped() is unified, the added and removed lines
are same.  And the binary size is kept almost same when
CONFIG_TRANSPARENT_HUGEPAGE=n.

		 text	   data	    bss	    dec	    hex	filename
base:		24179	   2028	    340	  26547	   67b3	mm/swapfile.o
unified:	24215	   2028	    340	  26583	   67d7	mm/swapfile.o

Link: http://lkml.kernel.org/r/20180720071845.17920-5-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-and-acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index c291400c4463..3c1172b53436 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -269,7 +269,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-	return info->flags & CLUSTER_FLAG_HUGE;
+	if (IS_ENABLED(CONFIG_THP_SWAP))
+		return info->flags & CLUSTER_FLAG_HUGE;
+	return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -1424,9 +1426,6 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
 	int i;
 	bool ret = false;
 
-	if (!IS_ENABLED(CONFIG_THP_SWAP))
-		return swap_swapcount(si, entry) != 0;
-
 	ci = lock_cluster_or_swap_info(si, offset);
 	if (!ci || !cluster_is_huge(ci)) {
 		if (swap_count(map[roffset]))
-- 
cgit v1.2.3


From a448f2d07f891ac65cc48017f17735ec73086bf0 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:17 -0700
Subject: mm/swapfile.c: unify normal/huge code path in put_swap_page()

In this patch, the normal/huge code path in put_swap_page() and several
helper functions are unified to avoid duplicated code, bugs, etc.  and
make it easier to review the code.

The removed lines are more than added lines.  And the binary size is
kept exactly same when CONFIG_TRANSPARENT_HUGEPAGE=n.

Link: http://lkml.kernel.org/r/20180720071845.17920-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 83 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 37 insertions(+), 46 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3c1172b53436..043645e7f0b5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER	HPAGE_PMD_NR
+
+#define swap_entry_size(size)	(size)
 #else
 #define SWAPFILE_CLUSTER	256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size)	1
 #endif
 #define LATENCY_LIMIT		256
 
@@ -1192,18 +1200,7 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-static void swapcache_free(swp_entry_t entry)
-{
-	struct swap_info_struct *p;
-
-	p = _swap_info_get(entry);
-	if (p) {
-		if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
-			free_swap_slot(entry);
-	}
-}
-
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
 {
 	unsigned long offset = swp_offset(entry);
 	unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1212,39 +1209,41 @@ static void swapcache_free_cluster(swp_entry_t entry)
 	unsigned char *map;
 	unsigned int i, free_entries = 0;
 	unsigned char val;
-
-	if (!IS_ENABLED(CONFIG_THP_SWAP))
-		return;
+	int size = swap_entry_size(hpage_nr_pages(page));
 
 	si = _swap_info_get(entry);
 	if (!si)
 		return;
 
-	ci = lock_cluster(si, offset);
-	VM_BUG_ON(!cluster_is_huge(ci));
-	map = si->swap_map + offset;
-	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-		val = map[i];
-		VM_BUG_ON(!(val & SWAP_HAS_CACHE));
-		if (val == SWAP_HAS_CACHE)
-			free_entries++;
-	}
-	if (!free_entries) {
-		for (i = 0; i < SWAPFILE_CLUSTER; i++)
-			map[i] &= ~SWAP_HAS_CACHE;
-	}
-	cluster_clear_huge(ci);
-	unlock_cluster(ci);
-	if (free_entries == SWAPFILE_CLUSTER) {
-		spin_lock(&si->lock);
+	if (size == SWAPFILE_CLUSTER) {
 		ci = lock_cluster(si, offset);
-		memset(map, 0, SWAPFILE_CLUSTER);
+		VM_BUG_ON(!cluster_is_huge(ci));
+		map = si->swap_map + offset;
+		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+			val = map[i];
+			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+			if (val == SWAP_HAS_CACHE)
+				free_entries++;
+		}
+		if (!free_entries) {
+			for (i = 0; i < SWAPFILE_CLUSTER; i++)
+				map[i] &= ~SWAP_HAS_CACHE;
+		}
+		cluster_clear_huge(ci);
 		unlock_cluster(ci);
-		mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-		swap_free_cluster(si, idx);
-		spin_unlock(&si->lock);
-	} else if (free_entries) {
-		for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+		if (free_entries == SWAPFILE_CLUSTER) {
+			spin_lock(&si->lock);
+			ci = lock_cluster(si, offset);
+			memset(map, 0, SWAPFILE_CLUSTER);
+			unlock_cluster(ci);
+			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+			swap_free_cluster(si, idx);
+			spin_unlock(&si->lock);
+			return;
+		}
+	}
+	if (size == 1 || free_entries) {
+		for (i = 0; i < size; i++, entry.val++) {
 			if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
 				free_swap_slot(entry);
 		}
@@ -1268,14 +1267,6 @@ int split_swap_cluster(swp_entry_t entry)
 }
 #endif
 
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
-	if (!PageTransHuge(page))
-		swapcache_free(entry);
-	else
-		swapcache_free_cluster(entry);
-}
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
 	const swp_entry_t *e1 = ent1, *e2 = ent2;
-- 
cgit v1.2.3


From 5d5e8f19544a35ae1609bd96ae6b28c9fcd1baf6 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:20 -0700
Subject: mm, swap, get_swap_pages: use entry_size instead of cluster in
 parameter

As suggested by Matthew Wilcox, it is better to use "int entry_size"
instead of "bool cluster" as parameter to specify whether to operate for
huge or normal swap entries.  Because this improve the flexibility to
support other swap entry size.  And Dave Hansen thinks that this
improves code readability too.

So in this patch, the "bool cluster" parameter of get_swap_pages() is
replaced by "int entry_size".

And nr_swap_entries() trick is used to reduce the binary size when
!CONFIG_TRANSPARENT_HUGE_PAGE.

       text	   data	    bss	    dec	    hex	filename
base  24215	   2028	    340	  26583	   67d7	mm/swapfile.o
head  24123	   2004	    340	  26467	   6763	mm/swapfile.o

Link: http://lkml.kernel.org/r/20180720071845.17920-7-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 +-
 mm/swap_slots.c      |  8 ++++----
 mm/swapfile.c        | 16 ++++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1a8bd05a335e..8e2c11e692ba 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -447,7 +447,7 @@ extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(struct page *page);
 extern void put_swap_page(struct page *page, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 008ccb22fee6..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 
 	cache->cur = 0;
 	if (swap_slot_cache_active)
-		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
-					   cache->slots);
+		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+					   cache->slots, 1);
 
 	return cache->nr;
 }
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
 
 	if (PageTransHuge(page)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, true, &entry);
+			get_swap_pages(1, &entry, HPAGE_PMD_NR);
 		goto out;
 	}
 
@@ -350,7 +350,7 @@ repeat:
 			goto out;
 	}
 
-	get_swap_pages(1, false, &entry);
+	get_swap_pages(1, &entry, 1);
 out:
 	if (mem_cgroup_try_charge_swap(page, entry)) {
 		put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 043645e7f0b5..b30dd0642ccf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -940,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 }
 
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 {
-	unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+	unsigned long size = swap_entry_size(entry_size);
 	struct swap_info_struct *si, *next;
 	long avail_pgs;
 	int n_ret = 0;
 	int node;
 
 	/* Only single cluster request supported */
-	WARN_ON_ONCE(n_goal > 1 && cluster);
+	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
-	avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
 	if (avail_pgs <= 0)
 		goto noswap;
 
@@ -961,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 	if (n_goal > avail_pgs)
 		n_goal = avail_pgs;
 
-	atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
 	spin_lock(&swap_avail_lock);
 
@@ -988,14 +988,14 @@ start_over:
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
-		if (cluster) {
+		if (size == SWAPFILE_CLUSTER) {
 			if (!(si->flags & SWP_FILE))
 				n_ret = swap_alloc_cluster(si, swp_entries);
 		} else
 			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 						    n_goal, swp_entries);
 		spin_unlock(&si->lock);
-		if (n_ret || cluster)
+		if (n_ret || size == SWAPFILE_CLUSTER)
 			goto check_out;
 		pr_debug("scan_swap_map of si %d failed to find offset\n",
 			si->type);
@@ -1021,7 +1021,7 @@ nextsi:
 
 check_out:
 	if (n_ret < n_goal)
-		atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+		atomic_long_add((long)(n_goal - n_ret) * size,
 				&nr_swap_pages);
 noswap:
 	return n_ret;
-- 
cgit v1.2.3


From b32d5f32b9dbe64970f41602066f7904df15f3e9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:24 -0700
Subject: mm/swapfile.c: add __swap_entry_free_locked()

The part of __swap_entry_free() with lock held is separated into a new
function __swap_entry_free_locked().  Because we want to reuse that
piece of code in some other places.

Just mechanical code refactoring, there is no any functional change in
this function.

Link: http://lkml.kernel.org/r/20180720071845.17920-8-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index b30dd0642ccf..d44b2d60a66a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1123,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
 	return p;
 }
 
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
-				       swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+					      unsigned long offset,
+					      unsigned char usage)
 {
-	struct swap_cluster_info *ci;
-	unsigned long offset = swp_offset(entry);
 	unsigned char count;
 	unsigned char has_cache;
 
-	ci = lock_cluster_or_swap_info(p, offset);
-
 	count = p->swap_map[offset];
 
 	has_cache = count & SWAP_HAS_CACHE;
@@ -1160,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
 	usage = count | has_cache;
 	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
 
+	return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+				       swp_entry_t entry, unsigned char usage)
+{
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+
+	ci = lock_cluster_or_swap_info(p, offset);
+	usage = __swap_entry_free_locked(p, offset, usage);
 	unlock_cluster_or_swap_info(p, ci);
 
 	return usage;
-- 
cgit v1.2.3


From c2343d2761f86ae1b857f78c7cdb9f51e5fa1641 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 21 Aug 2018 21:52:29 -0700
Subject: mm/swapfile.c: put_swap_page: share more between huge/normal code
 path

In this patch, locking related code is shared between huge/normal code
path in put_swap_page() to reduce code duplication. The `free_entries == 0`
case is merged into the more general `free_entries != SWAPFILE_CLUSTER`
case, because the new locking method makes it easy.

The added lines is same as the removed lines.  But the code size is
increased when CONFIG_TRANSPARENT_HUGEPAGE=n.

		text	   data	    bss	    dec	    hex	filename
base:	       24123	   2004	    340	  26467	   6763	mm/swapfile.o
unified:       24485	   2004	    340	  26829	   68cd	mm/swapfile.o

Dig on step deeper with `size -A mm/swapfile.o` for base and unified
kernel and compare the result, yields,

  -.text                                17723      0
  +.text                                17835      0
  -.orc_unwind_ip                        1380      0
  +.orc_unwind_ip                        1480      0
  -.orc_unwind                           2070      0
  +.orc_unwind                           2220      0
  -Total                                26686
  +Total                                27048

The total difference is the same.  The text segment difference is much
smaller: 112.  More difference comes from the ORC unwinder segments:
(1480 + 2220) - (1380 + 2070) = 250.  If the frame pointer unwinder is
used, this costs nothing.

Link: http://lkml.kernel.org/r/20180720071845.17920-9-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swapfile.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d44b2d60a66a..d954b71c4f9c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1223,8 +1223,8 @@ void put_swap_page(struct page *page, swp_entry_t entry)
 	if (!si)
 		return;
 
+	ci = lock_cluster_or_swap_info(si, offset);
 	if (size == SWAPFILE_CLUSTER) {
-		ci = lock_cluster(si, offset);
 		VM_BUG_ON(!cluster_is_huge(ci));
 		map = si->swap_map + offset;
 		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
@@ -1233,13 +1233,9 @@ void put_swap_page(struct page *page, swp_entry_t entry)
 			if (val == SWAP_HAS_CACHE)
 				free_entries++;
 		}
-		if (!free_entries) {
-			for (i = 0; i < SWAPFILE_CLUSTER; i++)
-				map[i] &= ~SWAP_HAS_CACHE;
-		}
 		cluster_clear_huge(ci);
-		unlock_cluster(ci);
 		if (free_entries == SWAPFILE_CLUSTER) {
+			unlock_cluster_or_swap_info(si, ci);
 			spin_lock(&si->lock);
 			ci = lock_cluster(si, offset);
 			memset(map, 0, SWAPFILE_CLUSTER);
@@ -1250,12 +1246,16 @@ void put_swap_page(struct page *page, swp_entry_t entry)
 			return;
 		}
 	}
-	if (size == 1 || free_entries) {
-		for (i = 0; i < size; i++, entry.val++) {
-			if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
-				free_swap_slot(entry);
+	for (i = 0; i < size; i++, entry.val++) {
+		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+			unlock_cluster_or_swap_info(si, ci);
+			free_swap_slot(entry);
+			if (i == size - 1)
+				return;
+			lock_cluster_or_swap_info(si, offset);
 		}
 	}
+	unlock_cluster_or_swap_info(si, ci);
 }
 
 #ifdef CONFIG_THP_SWAP
-- 
cgit v1.2.3


From 93065ac753e4443840a057bfef4be71ec766fde9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 21 Aug 2018 21:52:33 -0700
Subject: mm, oom: distinguish blockable mode for mmu notifiers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are several blockable mmu notifiers which might sleep in
mmu_notifier_invalidate_range_start and that is a problem for the
oom_reaper because it needs to guarantee a forward progress so it cannot
depend on any sleepable locks.

Currently we simply back off and mark an oom victim with blockable mmu
notifiers as done after a short sleep.  That can result in selecting a new
oom victim prematurely because the previous one still hasn't torn its
memory down yet.

We can do much better though.  Even if mmu notifiers use sleepable locks
there is no reason to automatically assume those locks are held.  Moreover
majority of notifiers only care about a portion of the address space and
there is absolutely zero reason to fail when we are unmapping an unrelated
range.  Many notifiers do really block and wait for HW which is harder to
handle and we have to bail out though.

This patch handles the low hanging fruit.
__mmu_notifier_invalidate_range_start gets a blockable flag and callbacks
are not allowed to sleep if the flag is set to false.  This is achieved by
using trylock instead of the sleepable lock for most callbacks and
continue as long as we do not block down the call chain.

I think we can improve that even further because there is a common pattern
to do a range lookup first and then do something about that.  The first
part can be done without a sleeping lock in most cases AFAICS.

The oom_reaper end then simply retries if there is at least one notifier
which couldn't make any progress in !blockable mode.  A retry loop is
already implemented to wait for the mmap_sem and this is basically the
same thing.

The simplest way for driver developers to test this code path is to wrap
userspace code which uses these notifiers into a memcg and set the hard
limit to hit the oom.  This can be done e.g.  after the test faults in all
the mmu notifier managed memory and set the hard limit to something really
small.  Then we are looking for a proper process tear down.

[akpm@linux-foundation.org: coding style fixes]
[akpm@linux-foundation.org: minor code simplification]
Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christian König <christian.koenig@amd.com> # AMD notifiers
Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx and umem_odp
Reported-by: David Rientjes <rientjes@google.com>
Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Dennis Dalessandro <dennis.dalessandro@intel.com>
Cc: Sudeep Dutt <sudeep.dutt@intel.com>
Cc: Ashutosh Dixit <ashutosh.dixit@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kvm/x86.c                      |  7 ++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  | 43 ++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_gem_userptr.c | 13 +++++++---
 drivers/gpu/drm/radeon/radeon_mn.c      | 22 ++++++++++++++---
 drivers/infiniband/core/umem_odp.c      | 33 +++++++++++++++++++------
 drivers/infiniband/hw/hfi1/mmu_rb.c     | 11 ++++++---
 drivers/infiniband/hw/mlx5/odp.c        |  2 +-
 drivers/misc/mic/scif/scif_dma.c        |  7 ++++--
 drivers/misc/sgi-gru/grutlbpurge.c      |  7 ++++--
 drivers/xen/gntdev.c                    | 44 ++++++++++++++++++++++++++-------
 include/linux/kvm_host.h                |  4 +--
 include/linux/mmu_notifier.h            | 33 +++++++++++++++++++------
 include/linux/oom.h                     |  2 +-
 include/rdma/ib_umem_odp.h              |  3 ++-
 mm/hmm.c                                |  7 ++++--
 mm/mmap.c                               |  2 +-
 mm/mmu_notifier.c                       | 19 +++++++++++---
 mm/oom_kill.c                           | 29 +++++++++++-----------
 virt/kvm/kvm_main.c                     | 15 +++++++----
 19 files changed, 223 insertions(+), 80 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f7dff0457846..4a74a7cf0a8b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end,
+		bool blockable)
 {
 	unsigned long apic_address;
 
@@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (start <= apic_address && apic_address < end)
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+	return 0;
 }
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index a365ea2383d1..e55508b39496 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn)
  *
  * @amn: our notifier
  */
-static void amdgpu_mn_read_lock(struct amdgpu_mn *amn)
+static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
 {
-	mutex_lock(&amn->read_lock);
+	if (blockable)
+		mutex_lock(&amn->read_lock);
+	else if (!mutex_trylock(&amn->read_lock))
+		return -EAGAIN;
+
 	if (atomic_inc_return(&amn->recursion) == 1)
 		down_read_non_owner(&amn->lock);
 	mutex_unlock(&amn->read_lock);
+
+	return 0;
 }
 
 /**
@@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
  * Block for operations on BOs to finish and mark pages as accessed and
  * potentially dirty.
  */
-static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
@@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	/* TODO we should be able to split locking for interval tree and
+	 * amdgpu_mn_invalidate_node
+	 */
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
 		amdgpu_mn_invalidate_node(node, start, end);
 	}
+
+	return 0;
 }
 
 /**
@@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
  * necessitates evicting all user-mode queues of the process. The BOs
  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
-static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
@@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 		struct amdgpu_bo *bo;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 				amdgpu_amdkfd_evict_userptr(mem, mm);
 		}
 	}
+
+	return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index dcd6e230d16a..2c9b284036d1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo)
 	mo->attached = false;
 }
 
-static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
+static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 						       struct mm_struct *mm,
 						       unsigned long start,
-						       unsigned long end)
+						       unsigned long end,
+						       bool blockable)
 {
 	struct i915_mmu_notifier *mn =
 		container_of(_mn, struct i915_mmu_notifier, mn);
@@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	LIST_HEAD(cancelled);
 
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
-		return;
+		return 0;
 
 	/* interval ranges are inclusive, but invalidate range is exclusive */
 	end--;
@@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	spin_lock(&mn->lock);
 	it = interval_tree_iter_first(&mn->objects, start, end);
 	while (it) {
+		if (!blockable) {
+			spin_unlock(&mn->lock);
+			return -EAGAIN;
+		}
 		/* The mmu_object is released late when destroying the
 		 * GEM object so it is entirely possible to gain a
 		 * reference on an object in the process of being freed
@@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 
 	if (!list_empty(&cancelled))
 		flush_workqueue(mn->wq);
+
+	return 0;
 }
 
 static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index abd24975c9b1..f8b35df44c60 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
-static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
+static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 					     struct mm_struct *mm,
 					     unsigned long start,
-					     unsigned long end)
+					     unsigned long end,
+					     bool blockable)
 {
 	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
 	struct ttm_operation_ctx ctx = { false, false };
 	struct interval_tree_node *it;
+	int ret = 0;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	mutex_lock(&rmn->lock);
+	/* TODO we should be able to split locking for interval tree and
+	 * the tear down.
+	 */
+	if (blockable)
+		mutex_lock(&rmn->lock);
+	else if (!mutex_trylock(&rmn->lock))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&rmn->objects, start, end);
 	while (it) {
@@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		struct radeon_bo *bo;
 		long r;
 
+		if (!blockable) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+
 		node = container_of(it, struct radeon_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		}
 	}
 	
+out_unlock:
 	mutex_unlock(&rmn->lock);
+
+	return ret;
 }
 
 static const struct mmu_notifier_ops radeon_mn_ops = {
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 182436b92ba9..6ec748eccff7 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
 				      ULLONG_MAX,
 				      ib_umem_notifier_release_trampoline,
+				      true,
 				      NULL);
 	up_read(&context->umem_rwsem);
 }
@@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
 	return 0;
 }
 
-static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
-						    unsigned long end)
+						    unsigned long end,
+						    bool blockable)
 {
 	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	int ret;
 
 	if (!context->invalidate_range)
-		return;
+		return 0;
+
+	if (blockable)
+		down_read(&context->umem_rwsem);
+	else if (!down_read_trylock(&context->umem_rwsem))
+		return -EAGAIN;
 
 	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_start_trampoline, NULL);
+				      invalidate_range_start_trampoline,
+				      blockable, NULL);
 	up_read(&context->umem_rwsem);
+
+	return ret;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
@@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (!context->invalidate_range)
 		return;
 
+	/*
+	 * TODO: we currently bail out if there is any sleepable work to be done
+	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+	 * here. But this is ugly and fragile.
+	 */
 	down_read(&context->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_end_trampoline, NULL);
+				      invalidate_range_end_trampoline, true, NULL);
 	up_read(&context->umem_rwsem);
 	ib_ucontext_notifier_end_account(context);
 }
@@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 last,
 				  umem_call_back cb,
+				  bool blockable,
 				  void *cookie)
 {
 	int ret_val = 0;
@@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 
 	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
 			node; node = next) {
+		/* TODO move the blockable decision up to the callback */
+		if (!blockable)
+			return -EAGAIN;
 		next = rbt_ib_umem_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
 		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 70aceefe14d5..e1c7996c018e 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -67,9 +67,9 @@ struct mmu_rb_handler {
 
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static void mmu_notifier_range_start(struct mmu_notifier *,
+static int mmu_notifier_range_start(struct mmu_notifier *,
 				     struct mm_struct *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long, bool);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
 					   unsigned long, unsigned long);
 static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 	handler->ops->remove(handler->ops_arg, node);
 }
 
-static void mmu_notifier_range_start(struct mmu_notifier *mn,
+static int mmu_notifier_range_start(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start,
-				     unsigned long end)
+				     unsigned long end,
+				     bool blockable)
 {
 	struct mmu_rb_handler *handler =
 		container_of(mn, struct mmu_rb_handler, mn);
@@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn,
 
 	if (added)
 		queue_work(handler->wq, &handler->del_work);
+
+	return 0;
 }
 
 /*
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f1a87a690a4c..d216e0d2921d 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 
 	down_read(&ctx->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
-				      mr_leaf_free, imr);
+				      mr_leaf_free, true, imr);
 	up_read(&ctx->umem_rwsem);
 
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
index 63d6246d6dff..6369aeaa7056 100644
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
 	schedule_work(&scif_info.misc_work);
 }
 
-static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						     struct mm_struct *mm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool blockable)
 {
 	struct scif_mmu_notif	*mmn;
 
 	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
 	scif_rma_destroy_tcw(mmn, start, end - start);
+
+	return 0;
 }
 
 static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index a3454eb56fbf..be28f05bfafa 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru)
 /*
  * MMUOPS notifier callout functions
  */
-static void gru_invalidate_range_start(struct mmu_notifier *mn,
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end)
+				       unsigned long start, unsigned long end,
+				       bool blockable)
 {
 	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
 						 ms_notifier);
@@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn,
 	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
 		start, end, atomic_read(&gms->ms_range_active));
 	gru_flush_tlb_range(gms, start, end - start);
+
+	return 0;
 }
 
 static void gru_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index c866a62f766d..57390c7666e5 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = {
 
 /* ------------------------------------------------------------------ */
 
+static bool in_range(struct gntdev_grant_map *map,
+			      unsigned long start, unsigned long end)
+{
+	if (!map->vma)
+		return false;
+	if (map->vma->vm_start >= end)
+		return false;
+	if (map->vma->vm_end <= start)
+		return false;
+
+	return true;
+}
+
 static void unmap_if_in_range(struct gntdev_grant_map *map,
 			      unsigned long start, unsigned long end)
 {
 	unsigned long mstart, mend;
 	int err;
 
-	if (!map->vma)
-		return;
-	if (map->vma->vm_start >= end)
-		return;
-	if (map->vma->vm_end <= start)
-		return;
 	mstart = max(start, map->vma->vm_start);
 	mend   = min(end,   map->vma->vm_end);
 	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
@@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map,
 	WARN_ON(err);
 }
 
-static void mn_invl_range_start(struct mmu_notifier *mn,
+static int mn_invl_range_start(struct mmu_notifier *mn,
 				struct mm_struct *mm,
-				unsigned long start, unsigned long end)
+				unsigned long start, unsigned long end,
+				bool blockable)
 {
 	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
 	struct gntdev_grant_map *map;
+	int ret = 0;
+
+	/* TODO do we really need a mutex here? */
+	if (blockable)
+		mutex_lock(&priv->lock);
+	else if (!mutex_trylock(&priv->lock))
+		return -EAGAIN;
 
-	mutex_lock(&priv->lock);
 	list_for_each_entry(map, &priv->maps, next) {
+		if (in_range(map, start, end)) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
 		unmap_if_in_range(map, start, end);
 	}
 	list_for_each_entry(map, &priv->freeable_maps, next) {
+		if (in_range(map, start, end)) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
 		unmap_if_in_range(map, start, end);
 	}
+
+out_unlock:
 	mutex_unlock(&priv->lock);
+
+	return ret;
 }
 
 static void mn_release(struct mmu_notifier *mn,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7c7362dd2faa..0205aee44ded 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end);
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end, bool blockable);
 
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 392e6af82701..133ba78820ee 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -151,13 +151,15 @@ struct mmu_notifier_ops {
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
 	 *
-	 * If both of these callbacks cannot block, and invalidate_range
-	 * cannot block, mmu_notifier_ops.flags should have
-	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
+	 * If blockable argument is set to false then the callback cannot
+	 * sleep and has to return with -EAGAIN. 0 should be returned
+	 * otherwise.
+	 *
 	 */
-	void (*invalidate_range_start)(struct mmu_notifier *mn,
+	int (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end);
+				       unsigned long start, unsigned long end,
+				       bool blockable);
 	void (*invalidate_range_end)(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start, unsigned long end);
@@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
-extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end);
+extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  bool blockable);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end,
 				  bool only_end);
@@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_start(mm, start, end);
+		__mmu_notifier_invalidate_range_start(mm, start, end, true);
+}
+
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_invalidate_range_start(mm, start, end, false);
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
 }
 
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6adac113e96d..92f70e4c6252 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
-void __oom_reap_task_mm(struct mm_struct *mm);
+bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 6a17f856f841..381cdf5a9bd1 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
  */
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 end,
-				  umem_call_back cb, void *cookie);
+				  umem_call_back cb,
+				  bool blockable, void *cookie);
 
 /*
  * Find first region intersecting with address range.
diff --git a/mm/hmm.c b/mm/hmm.c
index 76e7a058b32f..0b0554591610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	up_write(&hmm->mirrors_sem);
 }
 
-static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long start,
-				       unsigned long end)
+				       unsigned long end,
+				       bool blockable)
 {
 	struct hmm *hmm = mm->hmm;
 
 	VM_BUG_ON(!hmm);
 
 	atomic_inc(&hmm->sequence);
+
+	return 0;
 }
 
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/mm/mmap.c b/mm/mmap.c
index 8d6449e74431..bb2a7e097c7d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3064,7 +3064,7 @@ void exit_mmap(struct mm_struct *mm)
 		 * reliably test it.
 		 */
 		mutex_lock(&oom_lock);
-		__oom_reap_task_mm(mm);
+		(void)__oom_reap_task_mm(mm);
 		mutex_unlock(&oom_lock);
 
 		set_bit(MMF_OOM_SKIP, &mm->flags);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index eff6b88a993f..82bb1a939c0e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 	srcu_read_unlock(&srcu, id);
 }
 
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  bool blockable)
 {
 	struct mmu_notifier *mn;
+	int ret = 0;
 	int id;
 
 	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-		if (mn->ops->invalidate_range_start)
-			mn->ops->invalidate_range_start(mn, mm, start, end);
+		if (mn->ops->invalidate_range_start) {
+			int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+			if (_ret) {
+				pr_info("%pS callback failed with %d in %sblockable context.\n",
+						mn->ops->invalidate_range_start, _ret,
+						!blockable ? "non-" : "");
+				ret = _ret;
+			}
+		}
 	}
 	srcu_read_unlock(&srcu, id);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 412f43453a68..be31a1e0fe78 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -487,9 +487,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
+	bool ret = true;
 
 	/*
 	 * Tell all users of get_user/copy_from_user etc... that the content
@@ -519,12 +520,17 @@ void __oom_reap_task_mm(struct mm_struct *mm)
 			struct mmu_gather tlb;
 
 			tlb_gather_mmu(&tlb, mm, start, end);
-			mmu_notifier_invalidate_range_start(mm, start, end);
+			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+				ret = false;
+				continue;
+			}
 			unmap_page_range(&tlb, vma, start, end, NULL);
 			mmu_notifier_invalidate_range_end(mm, start, end);
 			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
+
+	return ret;
 }
 
 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
@@ -553,18 +559,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
-	/*
-	 * If the mm has invalidate_{start,end}() notifiers that could block,
-	 * sleep to give the oom victim some more time.
-	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time
-	 */
-	if (mm_has_blockable_invalidate_notifiers(mm)) {
-		up_read(&mm->mmap_sem);
-		schedule_timeout_idle(HZ);
-		goto unlock_oom;
-	}
-
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
@@ -579,7 +573,12 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	trace_start_task_reaping(tsk->pid);
 
-	__oom_reap_task_mm(mm);
+	/* failed to reap part of the address space. Try again later */
+	if (!__oom_reap_task_mm(mm)) {
+		up_read(&mm->mmap_sem);
+		ret = false;
+		goto unlock_oom;
+	}
 
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 			task_pid_nr(tsk), tsk->comm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9263ead9fd32..0116b449b993 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end, bool blockable)
 {
+	return 0;
 }
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
@@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
-						    unsigned long end)
+						    unsigned long end,
+						    bool blockable)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int need_tlb_flush = 0, idx;
+	int ret;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
@@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 
 	spin_unlock(&kvm->mmu_lock);
 
-	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
 
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	return ret;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
-- 
cgit v1.2.3


From af5679fbc669f31f7ebd0d473bca76c24c07de30 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 21 Aug 2018 21:52:37 -0700
Subject: mm, oom: remove oom_lock from oom_reaper

oom_reaper used to rely on the oom_lock since e2fe14564d33 ("oom_reaper:
close race with exiting task").  We do not really need the lock anymore
though.  212925802454 ("mm: oom: let oom_reap_task and exit_mmap run
concurrently") has removed serialization with the exit path based on the
mm reference count and so we do not really rely on the oom_lock anymore.

Tetsuo was arguing that at least MMF_OOM_SKIP should be set under the lock
to prevent from races when the page allocator didn't manage to get the
freed (reaped) memory in __alloc_pages_may_oom but it sees the flag later
on and move on to another victim.  Although this is possible in principle
let's wait for it to actually happen in real life before we make the
locking more complex again.

Therefore remove the oom_lock for oom_reaper paths (both exit_mmap and
oom_reap_task_mm).  The reaper serializes with exit_mmap by mmap_sem +
MMF_OOM_SKIP flag.  There is no synchronization with out_of_memory path
now.

[mhocko@kernel.org: oom_reap_task_mm should return false when __oom_reap_task_mm did]
  Link: http://lkml.kernel.org/r/20180724141747.GP28386@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/20180719075922.13784-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Suggested-by: David Rientjes <rientjes@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c     |  2 --
 mm/oom_kill.c | 30 ++++--------------------------
 2 files changed, 4 insertions(+), 28 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index bb2a7e097c7d..5f2b2b184c60 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm)
 		 * which clears VM_LOCKED, otherwise the oom reaper cannot
 		 * reliably test it.
 		 */
-		mutex_lock(&oom_lock);
 		(void)__oom_reap_task_mm(mm);
-		mutex_unlock(&oom_lock);
 
 		set_bit(MMF_OOM_SKIP, &mm->flags);
 		down_write(&mm->mmap_sem);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index be31a1e0fe78..66a86dd049a0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -535,28 +535,9 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 
 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
-	bool ret = true;
-
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * oom_reap_task_mm		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
-	mutex_lock(&oom_lock);
-
 	if (!down_read_trylock(&mm->mmap_sem)) {
-		ret = false;
 		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
+		return false;
 	}
 
 	/*
@@ -568,7 +549,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
 		up_read(&mm->mmap_sem);
 		trace_skip_task_reaping(tsk->pid);
-		goto unlock_oom;
+		return true;
 	}
 
 	trace_start_task_reaping(tsk->pid);
@@ -576,8 +557,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	/* failed to reap part of the address space. Try again later */
 	if (!__oom_reap_task_mm(mm)) {
 		up_read(&mm->mmap_sem);
-		ret = false;
-		goto unlock_oom;
+		return false;
 	}
 
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
@@ -588,9 +568,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	up_read(&mm->mmap_sem);
 
 	trace_finish_task_reaping(tsk->pid);
-unlock_oom:
-	mutex_unlock(&oom_lock);
-	return ret;
+	return true;
 }
 
 #define MAX_OOM_REAP_RETRIES 10
-- 
cgit v1.2.3


From c3b78b11efbb2865433abf9d22c004ffe4a73f5c Mon Sep 17 00:00:00 2001
From: Rodrigo Freire <rfreire@redhat.com>
Date: Tue, 21 Aug 2018 21:52:41 -0700
Subject: mm, oom: describe task memory unit, larger PID pad

The default page memory unit of OOM task dump events might not be
intuitive and potentially misleading for the non-initiated when debugging
OOM events: These are pages and not kBs.  Add a small printk prior to the
task dump informing that the memory units are actually memory _pages_.

Also extends PID field to align on up to 7 characters.
Reference https://lkml.org/lkml/2018/7/3/1201

Link: http://lkml.kernel.org/r/c795eb5129149ed8a6345c273aba167ff1bbd388.1530715938.git.rfreire@redhat.com
Signed-off-by: Rodrigo Freire <rfreire@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Rafael Aquini <aquini@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 66a86dd049a0..ed5e6a221a8e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	pr_info("Tasks state (memory values in pages):\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			mm_pgtables_bytes(task->mm),
-- 
cgit v1.2.3


From 431f42fdfdb36f06f43c711fc59be9b814d8fb22 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@kernel.org>
Date: Tue, 21 Aug 2018 21:52:45 -0700
Subject: mm/oom_kill.c: clean up oom_reap_task_mm()

Andrew has noticed some inconsistencies in oom_reap_task_mm.  Notably

 - Undocumented return value.

 - comment "failed to reap part..." is misleading - sounds like it's
   referring to something which happened in the past, is in fact
   referring to something which might happen in the future.

 - fails to call trace_finish_task_reaping() in one case

 - code duplication.

 - Increases mmap_sem hold time a little by moving
   trace_finish_task_reaping() inside the locked region.  So sue me ;)

 - Sharing the finish: path means that the trace event won't
   distinguish between the two sources of finishing.

Add a short explanation for the return value and fix the rest by
reorganizing the function a bit to have unified function exit paths.

Link: http://lkml.kernel.org/r/20180724141747.GP28386@dhcp22.suse.cz
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed5e6a221a8e..20600779f5db 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -534,8 +534,16 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	return ret;
 }
 
+/*
+ * Reaps the address space of the give task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 {
+	bool ret = true;
+
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		trace_skip_task_reaping(tsk->pid);
 		return false;
@@ -548,28 +556,28 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	 * down_write();up_write() cycle in exit_mmap().
 	 */
 	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-		up_read(&mm->mmap_sem);
 		trace_skip_task_reaping(tsk->pid);
-		return true;
+		goto out_unlock;
 	}
 
 	trace_start_task_reaping(tsk->pid);
 
 	/* failed to reap part of the address space. Try again later */
-	if (!__oom_reap_task_mm(mm)) {
-		up_read(&mm->mmap_sem);
-		return false;
-	}
+	ret = __oom_reap_task_mm(mm);
+	if (!ret)
+		goto out_finish;
 
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 			task_pid_nr(tsk), tsk->comm,
 			K(get_mm_counter(mm, MM_ANONPAGES)),
 			K(get_mm_counter(mm, MM_FILEPAGES)),
 			K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+	trace_finish_task_reaping(tsk->pid);
+out_unlock:
 	up_read(&mm->mmap_sem);
 
-	trace_finish_task_reaping(tsk->pid);
-	return true;
+	return ret;
 }
 
 #define MAX_OOM_REAP_RETRIES 10
-- 
cgit v1.2.3


From 871305bb20280804882bd08b39a38ccf1b4b68f9 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 21 Aug 2018 21:52:48 -0700
Subject: mm: /proc/pid/*maps remove is_pid and related wrappers

Patch series "cleanups and refactor of /proc/pid/smaps*".

The recent regression in /proc/pid/smaps made me look more into the code.
Especially the issues with smaps_rollup reported in [1] as explained in
Patch 4, which fixes them by refactoring the code.  Patches 2 and 3 are
preparations for that.  Patch 1 is me realizing that there's a lot of
boilerplate left from times where we tried (unsuccessfuly) to mark thread
stacks in the output.

Originally I had also plans to rework the translation from
/proc/pid/*maps* file offsets to the internal structures.  Now the offset
means "vma number", which is not really stable (vma's can come and go
between read() calls) and there's an extra caching of last vma's address.
My idea was that offsets would be interpreted directly as addresses, which
would also allow meaningful seeks (see the ugly seek_to_smaps_entry() in
tools/testing/selftests/vm/mlock2.h).  However loff_t is (signed) long
long so that might be insufficient somewhere for the unsigned long
addresses.

So the result is fixed issues with skewed /proc/pid/smaps_rollup results,
simpler smaps code, and a lot of unused code removed.

[1] https://marc.info/?l=linux-mm&m=151927723128134&w=2

This patch (of 4):

Commit b76437579d13 ("procfs: mark thread stack correctly in
proc/<pid>/maps") introduced differences between /proc/PID/maps and
/proc/PID/task/TID/maps to mark thread stacks properly, and this was
also done for smaps and numa_maps.  However it didn't work properly and
was ultimately removed by commit b18cb64ead40 ("fs/proc: Stop trying to
report thread stacks").

Now the is_pid parameter for the related show_*() functions is unused
and we can remove it together with wrapper functions and ops structures
that differ for PID and TID cases only in this parameter.

Link: http://lkml.kernel.org/r/20180723111933.15443-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Colascione <dancol@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c       |   6 +--
 fs/proc/internal.h   |   3 --
 fs/proc/task_mmu.c   | 114 +++++----------------------------------------------
 fs/proc/task_nommu.c |  39 ++----------------
 4 files changed, 18 insertions(+), 144 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index aaffc0c30216..ad047977ed04 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3309,12 +3309,12 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
-	REG("maps",      S_IRUGO, proc_tid_maps_operations),
+	REG("maps",      S_IRUGO, proc_pid_maps_operations),
 #ifdef CONFIG_PROC_CHILDREN
 	REG("children",  S_IRUGO, proc_tid_children_operations),
 #endif
 #ifdef CONFIG_NUMA
-	REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
+	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
 #endif
 	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
 	LNK("cwd",       proc_cwd_link),
@@ -3324,7 +3324,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-	REG("smaps",     S_IRUGO, proc_tid_smaps_operations),
+	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index da3dbfa09e79..0c538769512a 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -297,12 +297,9 @@ struct proc_maps_private {
 struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
 
 extern const struct file_operations proc_pid_maps_operations;
-extern const struct file_operations proc_tid_maps_operations;
 extern const struct file_operations proc_pid_numa_maps_operations;
-extern const struct file_operations proc_tid_numa_maps_operations;
 extern const struct file_operations proc_pid_smaps_operations;
 extern const struct file_operations proc_pid_smaps_rollup_operations;
-extern const struct file_operations proc_tid_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfd73a4616ce..a3f98ca50981 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -294,7 +294,7 @@ static void show_vma_header_prefix(struct seq_file *m,
 }
 
 static void
-show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
@@ -357,35 +357,18 @@ done:
 	seq_putc(m, '\n');
 }
 
-static int show_map(struct seq_file *m, void *v, int is_pid)
+static int show_map(struct seq_file *m, void *v)
 {
-	show_map_vma(m, v, is_pid);
+	show_map_vma(m, v);
 	m_cache_vma(m, v);
 	return 0;
 }
 
-static int show_pid_map(struct seq_file *m, void *v)
-{
-	return show_map(m, v, 1);
-}
-
-static int show_tid_map(struct seq_file *m, void *v)
-{
-	return show_map(m, v, 0);
-}
-
 static const struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
-	.show	= show_pid_map
-};
-
-static const struct seq_operations proc_tid_maps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_tid_map
+	.show	= show_map
 };
 
 static int pid_maps_open(struct inode *inode, struct file *file)
@@ -393,11 +376,6 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 	return do_maps_open(inode, file, &proc_pid_maps_op);
 }
 
-static int tid_maps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_tid_maps_op);
-}
-
 const struct file_operations proc_pid_maps_operations = {
 	.open		= pid_maps_open,
 	.read		= seq_read,
@@ -405,13 +383,6 @@ const struct file_operations proc_pid_maps_operations = {
 	.release	= proc_map_release,
 };
 
-const struct file_operations proc_tid_maps_operations = {
-	.open		= tid_maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= proc_map_release,
-};
-
 /*
  * Proportional Set Size(PSS): my share of RSS.
  *
@@ -733,7 +704,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 #define SEQ_PUT_DEC(str, val) \
 		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
-static int show_smap(struct seq_file *m, void *v, int is_pid)
+static int show_smap(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
@@ -796,7 +767,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		mss->pss_locked += mss->pss;
 
 	if (!rollup_mode) {
-		show_map_vma(m, vma, is_pid);
+		show_map_vma(m, vma);
 	} else if (last_vma) {
 		show_vma_header_prefix(
 			m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
@@ -845,28 +816,11 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 }
 #undef SEQ_PUT_DEC
 
-static int show_pid_smap(struct seq_file *m, void *v)
-{
-	return show_smap(m, v, 1);
-}
-
-static int show_tid_smap(struct seq_file *m, void *v)
-{
-	return show_smap(m, v, 0);
-}
-
 static const struct seq_operations proc_pid_smaps_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
-	.show	= show_pid_smap
-};
-
-static const struct seq_operations proc_tid_smaps_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_tid_smap
+	.show	= show_smap
 };
 
 static int pid_smaps_open(struct inode *inode, struct file *file)
@@ -893,11 +847,6 @@ static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int tid_smaps_open(struct inode *inode, struct file *file)
-{
-	return do_maps_open(inode, file, &proc_tid_smaps_op);
-}
-
 const struct file_operations proc_pid_smaps_operations = {
 	.open		= pid_smaps_open,
 	.read		= seq_read,
@@ -912,13 +861,6 @@ const struct file_operations proc_pid_smaps_rollup_operations = {
 	.release	= proc_map_release,
 };
 
-const struct file_operations proc_tid_smaps_operations = {
-	.open		= tid_smaps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= proc_map_release,
-};
-
 enum clear_refs_types {
 	CLEAR_REFS_ALL = 1,
 	CLEAR_REFS_ANON,
@@ -1728,7 +1670,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
-static int show_numa_map(struct seq_file *m, void *v, int is_pid)
+static int show_numa_map(struct seq_file *m, void *v)
 {
 	struct numa_maps_private *numa_priv = m->private;
 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1812,45 +1754,17 @@ out:
 	return 0;
 }
 
-static int show_pid_numa_map(struct seq_file *m, void *v)
-{
-	return show_numa_map(m, v, 1);
-}
-
-static int show_tid_numa_map(struct seq_file *m, void *v)
-{
-	return show_numa_map(m, v, 0);
-}
-
 static const struct seq_operations proc_pid_numa_maps_op = {
 	.start  = m_start,
 	.next   = m_next,
 	.stop   = m_stop,
-	.show   = show_pid_numa_map,
+	.show   = show_numa_map,
 };
 
-static const struct seq_operations proc_tid_numa_maps_op = {
-	.start  = m_start,
-	.next   = m_next,
-	.stop   = m_stop,
-	.show   = show_tid_numa_map,
-};
-
-static int numa_maps_open(struct inode *inode, struct file *file,
-			  const struct seq_operations *ops)
-{
-	return proc_maps_open(inode, file, ops,
-				sizeof(struct numa_maps_private));
-}
-
 static int pid_numa_maps_open(struct inode *inode, struct file *file)
 {
-	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
-}
-
-static int tid_numa_maps_open(struct inode *inode, struct file *file)
-{
-	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
+				sizeof(struct numa_maps_private));
 }
 
 const struct file_operations proc_pid_numa_maps_operations = {
@@ -1860,10 +1774,4 @@ const struct file_operations proc_pid_numa_maps_operations = {
 	.release	= proc_map_release,
 };
 
-const struct file_operations proc_tid_numa_maps_operations = {
-	.open		= tid_numa_maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= proc_map_release,
-};
 #endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5b62f57bd9bc..0b63d68dedb2 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -142,8 +142,7 @@ static int is_stack(struct vm_area_struct *vma)
 /*
  * display a single VMA to a sequenced file
  */
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
-			  int is_pid)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long ino = 0;
@@ -189,22 +188,11 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 /*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_p, int is_pid)
+static int show_map(struct seq_file *m, void *_p)
 {
 	struct rb_node *p = _p;
 
-	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
-			      is_pid);
-}
-
-static int show_pid_map(struct seq_file *m, void *_p)
-{
-	return show_map(m, _p, 1);
-}
-
-static int show_tid_map(struct seq_file *m, void *_p)
-{
-	return show_map(m, _p, 0);
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
@@ -260,14 +248,7 @@ static const struct seq_operations proc_pid_maps_ops = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
-	.show	= show_pid_map
-};
-
-static const struct seq_operations proc_tid_maps_ops = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= show_tid_map
+	.show	= show_map
 };
 
 static int maps_open(struct inode *inode, struct file *file,
@@ -308,11 +289,6 @@ static int pid_maps_open(struct inode *inode, struct file *file)
 	return maps_open(inode, file, &proc_pid_maps_ops);
 }
 
-static int tid_maps_open(struct inode *inode, struct file *file)
-{
-	return maps_open(inode, file, &proc_tid_maps_ops);
-}
-
 const struct file_operations proc_pid_maps_operations = {
 	.open		= pid_maps_open,
 	.read		= seq_read,
@@ -320,10 +296,3 @@ const struct file_operations proc_pid_maps_operations = {
 	.release	= map_release,
 };
 
-const struct file_operations proc_tid_maps_operations = {
-	.open		= tid_maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= map_release,
-};
-
-- 
cgit v1.2.3


From 8e68d689afe3284a5bc1663562d3e0a45d1c64fd Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 21 Aug 2018 21:52:52 -0700
Subject: mm: /proc/pid/smaps: factor out mem stats gathering

To prepare for handling /proc/pid/smaps_rollup differently from
/proc/pid/smaps factor out vma mem stats gathering from show_smap() - it
will be used by both.

Link: http://lkml.kernel.org/r/20180723111933.15443-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Colascione <dancol@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 55 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a3f98ca50981..d2ca88c92d9d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -702,14 +702,9 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 }
 #endif /* HUGETLB_PAGE */
 
-#define SEQ_PUT_DEC(str, val) \
-		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
-static int show_smap(struct seq_file *m, void *v)
+static void smap_gather_stats(struct vm_area_struct *vma,
+			     struct mem_size_stats *mss)
 {
-	struct proc_maps_private *priv = m->private;
-	struct vm_area_struct *vma = v;
-	struct mem_size_stats mss_stack;
-	struct mem_size_stats *mss;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
 #ifdef CONFIG_HUGETLB_PAGE
@@ -717,23 +712,6 @@ static int show_smap(struct seq_file *m, void *v)
 #endif
 		.mm = vma->vm_mm,
 	};
-	int ret = 0;
-	bool rollup_mode;
-	bool last_vma;
-
-	if (priv->rollup) {
-		rollup_mode = true;
-		mss = priv->rollup;
-		if (mss->first) {
-			mss->first_vma_start = vma->vm_start;
-			mss->first = false;
-		}
-		last_vma = !m_next_vma(priv, vma);
-	} else {
-		rollup_mode = false;
-		memset(&mss_stack, 0, sizeof(mss_stack));
-		mss = &mss_stack;
-	}
 
 	smaps_walk.private = mss;
 
@@ -765,6 +743,35 @@ static int show_smap(struct seq_file *m, void *v)
 	walk_page_vma(vma, &smaps_walk);
 	if (vma->vm_flags & VM_LOCKED)
 		mss->pss_locked += mss->pss;
+}
+
+#define SEQ_PUT_DEC(str, val) \
+		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
+static int show_smap(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+	struct mem_size_stats mss_stack;
+	struct mem_size_stats *mss;
+	int ret = 0;
+	bool rollup_mode;
+	bool last_vma;
+
+	if (priv->rollup) {
+		rollup_mode = true;
+		mss = priv->rollup;
+		if (mss->first) {
+			mss->first_vma_start = vma->vm_start;
+			mss->first = false;
+		}
+		last_vma = !m_next_vma(priv, vma);
+	} else {
+		rollup_mode = false;
+		memset(&mss_stack, 0, sizeof(mss_stack));
+		mss = &mss_stack;
+	}
+
+	smap_gather_stats(vma, mss);
 
 	if (!rollup_mode) {
 		show_map_vma(m, vma);
-- 
cgit v1.2.3


From f1547959d9efd0be6cac2a2fd32f05dd7144dd6c Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 21 Aug 2018 21:52:56 -0700
Subject: mm: /proc/pid/smaps: factor out common stats printing

To prepare for handling /proc/pid/smaps_rollup differently from
/proc/pid/smaps factor out from show_smap() printing the parts of output
that are common for both variants, which is the bulk of the gathered
memory stats.

[vbabka@suse.cz: add const, per Alexey]
  Link: http://lkml.kernel.org/r/b45f319f-cd04-337b-37f8-77f99786aa8a@suse.cz
Link: http://lkml.kernel.org/r/20180723111933.15443-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Colascione <dancol@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 51 +++++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d2ca88c92d9d..c47f3cab70a1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -747,6 +747,32 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 
 #define SEQ_PUT_DEC(str, val) \
 		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
+
+/* Show the contents common for smaps and smaps_rollup */
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+{
+	SEQ_PUT_DEC("Rss:            ", mss->resident);
+	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
+	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
+	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
+	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
+	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
+	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
+	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
+	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
+	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
+	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
+	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
+				  mss->private_hugetlb >> 10, 7);
+	SEQ_PUT_DEC(" kB\nSwap:           ", mss->swap);
+	SEQ_PUT_DEC(" kB\nSwapPss:        ",
+					mss->swap_pss >> PSS_SHIFT);
+	SEQ_PUT_DEC(" kB\nLocked:         ",
+					mss->pss_locked >> PSS_SHIFT);
+	seq_puts(m, " kB\n");
+}
+
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
@@ -791,28 +817,9 @@ static int show_smap(struct seq_file *m, void *v)
 		seq_puts(m, " kB\n");
 	}
 
-	if (!rollup_mode || last_vma) {
-		SEQ_PUT_DEC("Rss:            ", mss->resident);
-		SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
-		SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
-		SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
-		SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
-		SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
-		SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
-		SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
-		SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
-		SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
-		SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
-		SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
-		seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
-					  mss->private_hugetlb >> 10, 7);
-		SEQ_PUT_DEC(" kB\nSwap:           ", mss->swap);
-		SEQ_PUT_DEC(" kB\nSwapPss:        ",
-						mss->swap_pss >> PSS_SHIFT);
-		SEQ_PUT_DEC(" kB\nLocked:         ",
-						mss->pss_locked >> PSS_SHIFT);
-		seq_puts(m, " kB\n");
-	}
+	if (!rollup_mode || last_vma)
+		__show_smap(m, mss);
+
 	if (!rollup_mode) {
 		if (arch_pkeys_enabled())
 			seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
-- 
cgit v1.2.3


From 258f669e7e88c18edbc23fe5ce00a476b924551f Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 21 Aug 2018 21:52:59 -0700
Subject: mm: /proc/pid/smaps_rollup: convert to single value seq_file

The /proc/pid/smaps_rollup file is currently implemented via the
m_start/m_next/m_stop seq_file iterators shared with the other maps files,
that iterate over vma's.  However, the rollup file doesn't print anything
for each vma, only accumulate the stats.

There are some issues with the current code as reported in [1] - the
accumulated stats can get skewed if seq_file start()/stop() op is called
multiple times, if show() is called multiple times, and after seeks to
non-zero position.

Patch [1] fixed those within existing design, but I believe it is
fundamentally wrong to expose the vma iterators to the seq_file mechanism
when smaps_rollup shows logically a single set of values for the whole
address space.

This patch thus refactors the code to provide a single "value" at offset
0, with vma iteration to gather the stats done internally.  This fixes the
situations where results are skewed, and simplifies the code, especially
in show_smap(), at the expense of somewhat less code reuse.

[1] https://marc.info/?l=linux-mm&m=151927723128134&w=2

[vbabka@suse.c: use seq_file infrastructure]
  Link: http://lkml.kernel.org/r/bf4525b0-fd5b-4c4c-2cb3-adee3dd95a48@suse.cz
Link: http://lkml.kernel.org/r/20180723111933.15443-5-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Daniel Colascione <dancol@google.com>
Reviewed-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h |   1 -
 fs/proc/task_mmu.c | 155 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 96 insertions(+), 60 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0c538769512a..f5b75d258d22 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -285,7 +285,6 @@ struct proc_maps_private {
 	struct inode *inode;
 	struct task_struct *task;
 	struct mm_struct *mm;
-	struct mem_size_stats *rollup;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c47f3cab70a1..5ea1d64cb0b4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -247,7 +247,6 @@ static int proc_map_release(struct inode *inode, struct file *file)
 	if (priv->mm)
 		mmdrop(priv->mm);
 
-	kfree(priv->rollup);
 	return seq_release_private(inode, file);
 }
 
@@ -404,7 +403,6 @@ const struct file_operations proc_pid_maps_operations = {
 
 #ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats {
-	bool first;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -418,7 +416,6 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
-	unsigned long first_vma_start;
 	u64 pss;
 	u64 pss_locked;
 	u64 swap_pss;
@@ -775,57 +772,75 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
 
 static int show_smap(struct seq_file *m, void *v)
 {
-	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct mem_size_stats mss_stack;
-	struct mem_size_stats *mss;
+	struct mem_size_stats mss;
+
+	memset(&mss, 0, sizeof(mss));
+
+	smap_gather_stats(vma, &mss);
+
+	show_map_vma(m, vma);
+
+	SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
+	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
+	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
+	seq_puts(m, " kB\n");
+
+	__show_smap(m, &mss);
+
+	if (arch_pkeys_enabled())
+		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
+	show_smap_vma_flags(m, vma);
+
+	m_cache_vma(m, vma);
+
+	return 0;
+}
+
+static int show_smaps_rollup(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct mem_size_stats mss;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned long last_vma_end = 0;
 	int ret = 0;
-	bool rollup_mode;
-	bool last_vma;
-
-	if (priv->rollup) {
-		rollup_mode = true;
-		mss = priv->rollup;
-		if (mss->first) {
-			mss->first_vma_start = vma->vm_start;
-			mss->first = false;
-		}
-		last_vma = !m_next_vma(priv, vma);
-	} else {
-		rollup_mode = false;
-		memset(&mss_stack, 0, sizeof(mss_stack));
-		mss = &mss_stack;
-	}
 
-	smap_gather_stats(vma, mss);
+	priv->task = get_proc_task(priv->inode);
+	if (!priv->task)
+		return -ESRCH;
 
-	if (!rollup_mode) {
-		show_map_vma(m, vma);
-	} else if (last_vma) {
-		show_vma_header_prefix(
-			m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
-		seq_pad(m, ' ');
-		seq_puts(m, "[rollup]\n");
-	} else {
-		ret = SEQ_SKIP;
+	mm = priv->mm;
+	if (!mm || !mmget_not_zero(mm)) {
+		ret = -ESRCH;
+		goto out_put_task;
 	}
 
-	if (!rollup_mode) {
-		SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
-		SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
-		SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
-		seq_puts(m, " kB\n");
-	}
+	memset(&mss, 0, sizeof(mss));
 
-	if (!rollup_mode || last_vma)
-		__show_smap(m, mss);
+	down_read(&mm->mmap_sem);
+	hold_task_mempolicy(priv);
 
-	if (!rollup_mode) {
-		if (arch_pkeys_enabled())
-			seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
-		show_smap_vma_flags(m, vma);
+	for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
+		smap_gather_stats(vma, &mss);
+		last_vma_end = vma->vm_end;
 	}
-	m_cache_vma(m, vma);
+
+	show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+			       last_vma_end, 0, 0, 0, 0);
+	seq_pad(m, ' ');
+	seq_puts(m, "[rollup]\n");
+
+	__show_smap(m, &mss);
+
+	release_task_mempolicy(priv);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+out_put_task:
+	put_task_struct(priv->task);
+	priv->task = NULL;
+
 	return ret;
 }
 #undef SEQ_PUT_DEC
@@ -842,23 +857,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
 	return do_maps_open(inode, file, &proc_pid_smaps_op);
 }
 
-static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+static int smaps_rollup_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *seq;
+	int ret;
 	struct proc_maps_private *priv;
-	int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
-
-	if (ret < 0)
-		return ret;
-	seq = file->private_data;
-	priv = seq->private;
-	priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
-	if (!priv->rollup) {
-		proc_map_release(inode, file);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
+	if (!priv)
 		return -ENOMEM;
+
+	ret = single_open(file, show_smaps_rollup, priv);
+	if (ret)
+		goto out_free;
+
+	priv->inode = inode;
+	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+	if (IS_ERR(priv->mm)) {
+		ret = PTR_ERR(priv->mm);
+
+		single_release(inode, file);
+		goto out_free;
 	}
-	priv->rollup->first = true;
+
 	return 0;
+
+out_free:
+	kfree(priv);
+	return ret;
+}
+
+static int smaps_rollup_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct proc_maps_private *priv = seq->private;
+
+	if (priv->mm)
+		mmdrop(priv->mm);
+
+	kfree(priv);
+	return single_release(inode, file);
 }
 
 const struct file_operations proc_pid_smaps_operations = {
@@ -869,10 +906,10 @@ const struct file_operations proc_pid_smaps_operations = {
 };
 
 const struct file_operations proc_pid_smaps_rollup_operations = {
-	.open		= pid_smaps_rollup_open,
+	.open		= smaps_rollup_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= proc_map_release,
+	.release	= smaps_rollup_release,
 };
 
 enum clear_refs_types {
-- 
cgit v1.2.3


From a3bf6ce366496016990d8578af74673ea04178ff Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2018 21:53:03 -0700
Subject: mm/mempool.c: add missing parameter description

The kernel-doc for mempool_init function is missing the description of the
pool parameter.  Add it.

Link: http://lkml.kernel.org/r/1532336274-26228-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempool.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/mempool.c b/mm/mempool.c
index 44f5fa98c1e7..0ef8cc8d1602 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node);
 
 /**
  * mempool_init - initialize a memory pool
+ * @pool:      pointer to the memory pool that should be initialized
  * @min_nr:    the minimum number of elements guaranteed to be
  *             allocated for this pool.
  * @alloc_fn:  user-defined element-allocation function.
-- 
cgit v1.2.3


From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 21 Aug 2018 21:53:06 -0700
Subject: mm: zero out the vma in vma_init()

Rather than in vm_area_alloc().  To ensure that the various oddball
stack-based vmas are in a good state.  Some of the callers were zeroing
them out, others were not.

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/process.c | 9 ++++-----
 fs/hugetlbfs/inode.c      | 2 --
 include/linux/mm.h        | 1 +
 kernel/fork.c             | 3 ++-
 mm/mempolicy.c            | 1 -
 mm/shmem.c                | 1 -
 6 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index d9c299133111..82ab015bf42b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
  * atomic helpers. Insert it into the gate_vma so that it is visible
  * through ptrace and /proc/<pid>/mem.
  */
-static struct vm_area_struct gate_vma = {
-	.vm_start	= 0xffff0000,
-	.vm_end		= 0xffff0000 + PAGE_SIZE,
-	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC,
-};
+static struct vm_area_struct gate_vma;
 
 static int __init gate_vma_init(void)
 {
 	vma_init(&gate_vma, NULL);
 	gate_vma.vm_page_prot = PAGE_READONLY_EXEC;
+	gate_vma.vm_start = 0xffff0000;
+	gate_vma.vm_end	= 0xffff0000 + PAGE_SIZE;
+	gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC;
 	return 0;
 }
 arch_initcall(gate_vma_init);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 346a146c7617..32920a10100e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 	int i, freed = 0;
 	bool truncate_op = (lend == LLONG_MAX);
 
-	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
 	vma_init(&pseudo_vma, current->mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pagevec_init(&pvec);
@@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 	 * allocation routines.  If NUMA is configured, use page index
 	 * as input to create an allocation policy.
 	 */
-	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
 	vma_init(&pseudo_vma, mm);
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
 	pseudo_vma.vm_file = file;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3cae495f9ce..3a4b87d1a59a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	static const struct vm_operations_struct dummy_vm_ops = {};
 
+	memset(vma, 0, sizeof(*vma));
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5ee74c113381..8c760effa42e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep;
 
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
-	struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	struct vm_area_struct *vma;
 
+	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (vma)
 		vma_init(vma, mm);
 	return vma;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01f1a14facc4..4861ba738d6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			goto put_new;
 
 		/* Create pseudo-vma that contains just the policy */
-		memset(&pvma, 0, sizeof(struct vm_area_struct));
 		vma_init(&pvma, NULL);
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
diff --git a/mm/shmem.c b/mm/shmem.c
index c48c79018a7c..fb04baacc9fa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
 		struct shmem_inode_info *info, pgoff_t index)
 {
 	/* Create a pseudo vma that just contains the policy */
-	memset(vma, 0, sizeof(*vma));
 	vma_init(vma, NULL);
 	/* Bias interleave by inode number to distribute better across nodes */
 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
-- 
cgit v1.2.3


From 8c9a134cae6f8d66f35321b07baa202f75ef2199 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 21 Aug 2018 21:53:10 -0700
Subject: mm: clarify CONFIG_PAGE_POISONING and usage

The Kconfig text for CONFIG_PAGE_POISONING doesn't mention that it has to
be enabled explicitly.  This updates the documentation for that and adds a
note about CONFIG_PAGE_POISONING to the "page_poison" command line docs.
While here, change description of CONFIG_PAGE_POISONING_ZERO too, as it's
not "random" data, but rather the fixed debugging value that would be used
when not zeroing.  Additionally removes a stray "bool" in the Kconfig.

Link: http://lkml.kernel.org/r/20180725223832.GA43733@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 5 +++--
 mm/Kconfig.debug                                | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bffb0caa3693..970d837bd57f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3041,8 +3041,9 @@
 			on: enable the feature
 
 	page_poison=	[KNL] Boot-time parameter changing the state of
-			poisoning on the buddy allocator.
-			off: turn off poisoning
+			poisoning on the buddy allocator, available with
+			CONFIG_PAGE_POISONING=y.
+			off: turn off poisoning (default)
 			on: turn on poisoning
 
 	panic=		[KNL] Kernel behaviour on panic: delay <timeout>
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e5e606ee5f71..9a7b8b049d04 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -46,7 +46,8 @@ config PAGE_POISONING
 	  Fill the pages with poison patterns after free_pages() and verify
 	  the patterns before alloc_pages. The filling of the memory helps
 	  reduce the risk of information leaks from freed data. This does
-	  have a potential performance impact.
+	  have a potential performance impact if enabled with the
+	  "page_poison=1" kernel boot option.
 
 	  Note that "poison" here is not the same thing as the "HWPoison"
 	  for CONFIG_MEMORY_FAILURE. This is software poisoning only.
@@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY
 	   say N.
 
 config PAGE_POISONING_ZERO
-	bool "Use zero for poisoning instead of random data"
+	bool "Use zero for poisoning instead of debugging value"
 	depends on PAGE_POISONING
 	---help---
 	   Instead of using the existing poison value, fill the pages with
@@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO
 	   allocation.
 
 	   If unsure, say N
-	bool
 
 config DEBUG_PAGE_REF
 	bool "Enable tracepoint to track down page reference manipulation"
-- 
cgit v1.2.3


From 1c4c3b99c03d3e72ac643b01edcb83c0c0aafd46 Mon Sep 17 00:00:00 2001
From: Jiang Biao <jiang.biao2@zte.com.cn>
Date: Tue, 21 Aug 2018 21:53:13 -0700
Subject: mm: fix page_freeze_refs and page_unfreeze_refs in comments

page_freeze_refs/page_unfreeze_refs have already been relplaced by
page_ref_freeze/page_ref_unfreeze , but they are not modified in the
comments.

Link: http://lkml.kernel.org/r/1532590226-106038-1-git-send-email-jiang.biao2@zte.com.cn
Signed-off-by: Jiang Biao <jiang.biao2@zte.com.cn>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c            | 4 ++--
 mm/memory-failure.c | 2 +-
 mm/vmscan.c         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 2621be57bd95..1bd514c9e5d0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -703,7 +703,7 @@ again:
 	 * We cannot do anything with the page while its refcount is 0.
 	 * Usually 0 means free, or tail of a higher-order page: in which
 	 * case this node is no longer referenced, and should be freed;
-	 * however, it might mean that the page is under page_freeze_refs().
+	 * however, it might mean that the page is under page_ref_freeze().
 	 * The __remove_mapping() case is easy, again the node is now stale;
 	 * but if page is swapcache in migrate_page_move_mapping(), it might
 	 * still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ again:
 		 * work here too.  We have chosen the !PageSwapCache test to
 		 * optimize the common case, when the page is or is about to
 		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
-		 * in the freeze_refs section of __remove_mapping(); but Anon
+		 * in the ref_freeze section of __remove_mapping(); but Anon
 		 * page->mapping reset to NULL later, in free_pages_prepare().
 		 */
 		if (!PageSwapCache(page))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9b86dc..c83a1746812f 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags)
 	 *    R/W the page; let's pray that the page has been
 	 *    used and will be freed some time later.
 	 * In fact it's dangerous to directly bump up page count from 0,
-	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
 	 */
 	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
 		if (is_free_buddy_page(p)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3c6e2bfee427..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -903,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		refcount = 2;
 	if (!page_ref_freeze(page, refcount))
 		goto cannot_free;
-	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
 		page_ref_unfreeze(page, refcount);
 		goto cannot_free;
-- 
cgit v1.2.3


From 8de7ecc6483bf94cef9f48c4c3511ef3e9ee6d40 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Tue, 21 Aug 2018 21:53:17 -0700
Subject: memcg: reduce memcg tree traversals for stats collection

Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
to collect the stats while cgroup-v2's memory_stat_show traverses the
memcg tree thrice.  On a large machine, a couple thousand memcgs is very
normal and if the churn is high and memcgs stick around during to several
reasons, tens of thousands of nodes in memcg tree can exist.  This patch
has refactored and shared the stat collection code between cgroup-v1 and
cgroup-v2 and has reduced the tree traversal to just one.

I ran a simple benchmark which reads the root_mem_cgroup's stat file
1000 times in the presense of 2500 memcgs on cgroup-v1. The results are:

Without the patch:
$ time ./read-root-stat-1000-times

real    0m1.663s
user    0m0.000s
sys     0m1.660s

With the patch:
$ time ./read-root-stat-1000-times

real    0m0.468s
user    0m0.000s
sys     0m0.467s

Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Bruce Merry <bmerry@ska.ac.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 150 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 73 insertions(+), 77 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a921890739f..59c14c988143 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
-	struct mem_cgroup *iter;
-	int i;
-
-	memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += memcg_page_state(iter, i);
-	}
-}
+struct accumulated_stats {
+	unsigned long stat[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+	unsigned long lru_pages[NR_LRU_LISTS];
+	const unsigned int *stats_array;
+	const unsigned int *events_array;
+	int stats_size;
+	int events_size;
+};
 
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+				  struct accumulated_stats *acc)
 {
-	struct mem_cgroup *iter;
+	struct mem_cgroup *mi;
 	int i;
 
-	memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+	for_each_mem_cgroup_tree(mi, memcg) {
+		for (i = 0; i < acc->stats_size; i++)
+			acc->stat[i] += memcg_page_state(mi,
+				acc->stats_array ? acc->stats_array[i] : i);
 
-	for_each_mem_cgroup_tree(iter, memcg) {
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			events[i] += memcg_sum_events(iter, i);
+		for (i = 0; i < acc->events_size; i++)
+			acc->events[i] += memcg_sum_events(mi,
+				acc->events_array ? acc->events_array[i] : i);
+
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			acc->lru_pages[i] +=
+				mem_cgroup_nr_lru_pages(mi, BIT(i));
 	}
 }
 
@@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	unsigned long memory, memsw;
 	struct mem_cgroup *mi;
 	unsigned int i;
+	struct accumulated_stats acc;
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		seq_printf(m, "hierarchical_memsw_limit %llu\n",
 			   (u64)memsw * PAGE_SIZE);
 
-	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-		unsigned long long val = 0;
+	memset(&acc, 0, sizeof(acc));
+	acc.stats_size = ARRAY_SIZE(memcg1_stats);
+	acc.stats_array = memcg1_stats;
+	acc.events_size = ARRAY_SIZE(memcg1_events);
+	acc.events_array = memcg1_events;
+	accumulate_memcg_tree(memcg, &acc);
 
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += memcg_page_state(mi, memcg1_stats[i]) *
-			PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+			   (u64)acc.stat[i] * PAGE_SIZE);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
-		unsigned long long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += memcg_sum_events(mi, memcg1_events[i]);
-		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
-	}
-
-	for (i = 0; i < NR_LRU_LISTS; i++) {
-		unsigned long long val = 0;
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+			   (u64)acc.events[i]);
 
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
-	}
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+			   (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 #ifdef CONFIG_DEBUG_VM
 	{
@@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
+	struct accumulated_stats acc;
 	int i;
 
 	/*
@@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 * Current memory state:
 	 */
 
-	tree_stat(memcg, stat);
-	tree_events(memcg, events);
+	memset(&acc, 0, sizeof(acc));
+	acc.stats_size = MEMCG_NR_STAT;
+	acc.events_size = NR_VM_EVENT_ITEMS;
+	accumulate_memcg_tree(memcg, &acc);
 
 	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+		   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
-		   (u64)(stat[NR_SLAB_RECLAIMABLE] +
-			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+		   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+			 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
 	seq_printf(m, "sock %llu\n",
-		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
 
 	seq_printf(m, "shmem %llu\n",
-		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+		   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
 
-	for (i = 0; i < NR_LRU_LISTS; i++) {
-		struct mem_cgroup *mi;
-		unsigned long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
-		seq_printf(m, "%s %llu\n",
-			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
-	}
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+			   (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
 	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
 
-	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
-		   events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
-		   events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+		   acc.events[PGSCAN_DIRECT]);
+	seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+		   acc.events[PGSTEAL_DIRECT]);
+	seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+	seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
 
 	seq_printf(m, "workingset_refault %lu\n",
-		   stat[WORKINGSET_REFAULT]);
+		   acc.stat[WORKINGSET_REFAULT]);
 	seq_printf(m, "workingset_activate %lu\n",
-		   stat[WORKINGSET_ACTIVATE]);
+		   acc.stat[WORKINGSET_ACTIVATE]);
 	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   stat[WORKINGSET_NODERECLAIM]);
+		   acc.stat[WORKINGSET_NODERECLAIM]);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 85f237a57f143c0c895dcb7cc53fa0174522ce07 Mon Sep 17 00:00:00 2001
From: juviliu <juviliu@tencent.com>
Date: Tue, 21 Aug 2018 21:53:20 -0700
Subject: Documentation/sysctl/vm.txt: update __vm_enough_memory()'s path

__vm_enough_memory has moved to mm/util.c.

Link: http://lkml.kernel.org/r/E18EDF4A4FA4A04BBFA824B6D7699E532A7E5913@EXMBX-SZMAIL013.tencent.com
Signed-off-by: Juvi Liu <juviliu@tencent.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e72853b6d725..7d73882e2c27 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -691,7 +691,7 @@ and don't use much of it.
 The default value is 0.
 
 See Documentation/vm/overcommit-accounting.rst and
-mm/mmap.c::__vm_enough_memory() for more information.
+mm/util.c::__vm_enough_memory() for more information.
 
 ==============================================================
 
-- 
cgit v1.2.3


From 89696701ea847786f88ccc1c580fb4c3c20c4a3a Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 21 Aug 2018 21:53:24 -0700
Subject: mm: remove zone_id() and make use of zone_idx() in is_dev_zone()

is_dev_zone() is using zone_id() to check if the zone is ZONE_DEVICE.
zone_id() looks pretty much the same as zone_idx(), and while the use of
zone_idx() is quite spread in the kernel, zone_id() is only being used by
is_dev_zone().

This patch removes zone_id() and makes is_dev_zone() use zone_idx() to
check the zone, so we do not have two things with the same functionality
around.

Link: http://lkml.kernel.org/r/20180730133718.28683-1-osalvador@techadventures.net
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2dc52a..3d4577a3ae7e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -755,25 +755,6 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 	return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
 }
 
-static inline int zone_id(const struct zone *zone)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	return zone - pgdat->node_zones;
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return zone_id(zone) == ZONE_DEVICE;
-}
-#else
-static inline bool is_dev_zone(const struct zone *zone)
-{
-	return false;
-}
-#endif
-
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
@@ -824,6 +805,18 @@ static inline int local_memory_node(int node_id) { return node_id; };
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+	return zone_idx(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool is_dev_zone(const struct zone *zone)
+{
+	return false;
+}
+#endif
+
 /*
  * Returns true if a zone has pages managed by the buddy allocator.
  * All the reclaim decisions have to use this function rather than
-- 
cgit v1.2.3


From ace1db39768cdc58f6b157d99ae5958ad34ffff8 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 21 Aug 2018 21:53:29 -0700
Subject: mm/page_alloc.c: move ifdefery out of free_area_init_core

Patch series "Refactor free_area_init_core and add
free_area_init_core_hotplug", v6.

This patchset does three things:

 1) Clean up/refactor free_area_init_core/free_area_init_node
    by moving the ifdefery out of the functions.
 2) Move the pgdat/zone initialization in free_area_init_core to its
    own function.
 3) Introduce free_area_init_core_hotplug, a small subset of
    free_area_init_core, which is only called from memhotlug code path. In this
    way, we have:

    free_area_init_core: called during early initialization
    free_area_init_core_hotplug: called whenever a new node is allocated/re-used (memhotplug path)

This patch (of 5):

Moving the #ifdefs out of the function makes it easier to follow.

Link: http://lkml.kernel.org/r/20180730101757.28058-2-osalvador@techadventures.net
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15ea511fb41c..455cc3bfae99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6194,6 +6194,37 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
 	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_numabalancing(struct pglist_data *pgdat)
+{
+	spin_lock_init(&pgdat->numabalancing_migrate_lock);
+	pgdat->numabalancing_migrate_nr_pages = 0;
+	pgdat->numabalancing_migrate_next_window = jiffies;
+}
+#else
+static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+	spin_lock_init(&pgdat->split_queue_lock);
+	INIT_LIST_HEAD(&pgdat->split_queue);
+	pgdat->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif
+
+#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+	init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -6208,21 +6239,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 	int nid = pgdat->node_id;
 
 	pgdat_resize_init(pgdat);
-#ifdef CONFIG_NUMA_BALANCING
-	spin_lock_init(&pgdat->numabalancing_migrate_lock);
-	pgdat->numabalancing_migrate_nr_pages = 0;
-	pgdat->numabalancing_migrate_next_window = jiffies;
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	spin_lock_init(&pgdat->split_queue_lock);
-	INIT_LIST_HEAD(&pgdat->split_queue);
-	pgdat->split_queue_len = 0;
-#endif
+
+	pgdat_init_numabalancing(pgdat);
+	pgdat_init_split_queue(pgdat);
+	pgdat_init_kcompactd(pgdat);
+
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
-#ifdef CONFIG_COMPACTION
-	init_waitqueue_head(&pgdat->kcompactd_wait);
-#endif
+
 	pgdat_page_ext_init(pgdat);
 	spin_lock_init(&pgdat->lru_lock);
 	lruvec_init(node_lruvec(pgdat));
-- 
cgit v1.2.3


From c1093b746c0576ed81c4d568d1e39cab651d37e6 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Tue, 21 Aug 2018 21:53:32 -0700
Subject: mm: access zone->node via zone_to_nid() and zone_set_nid()

zone->node is configured only when CONFIG_NUMA=y, so it is a good idea to
have inline functions to access this field in order to avoid ifdef's in c
files.

Link: http://lkml.kernel.org/r/20180730101757.28058-3-osalvador@techadventures.net
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h     |  9 ---------
 include/linux/mmzone.h | 26 ++++++++++++++++++++------
 mm/mempolicy.c         |  4 ++--
 mm/mm_init.c           |  9 ++-------
 mm/page_alloc.c        | 10 ++++------
 5 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3a4b87d1a59a..a55f5389c491 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -960,15 +960,6 @@ static inline int page_zone_id(struct page *page)
 	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
 }
 
-static inline int zone_to_nid(struct zone *zone)
-{
-#ifdef CONFIG_NUMA
-	return zone->node;
-#else
-	return 0;
-#endif
-}
-
 #ifdef NODE_NOT_IN_PAGE_FLAGS
 extern int page_to_nid(const struct page *page);
 #else
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3d4577a3ae7e..1e22d96734e0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -834,6 +834,25 @@ static inline bool populated_zone(struct zone *zone)
 	return zone->present_pages;
 }
 
+#ifdef CONFIG_NUMA
+static inline int zone_to_nid(struct zone *zone)
+{
+	return zone->node;
+}
+
+static inline void zone_set_nid(struct zone *zone, int nid)
+{
+	zone->node = nid;
+}
+#else
+static inline int zone_to_nid(struct zone *zone)
+{
+	return 0;
+}
+
+static inline void zone_set_nid(struct zone *zone, int nid) {}
+#endif
+
 extern int movable_zone;
 
 #ifdef CONFIG_HIGHMEM
@@ -949,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref)
 
 static inline int zonelist_node_idx(struct zoneref *zoneref)
 {
-#ifdef CONFIG_NUMA
-	/* zone_to_nid not available in this context */
-	return zoneref->zone->node;
-#else
-	return 0;
-#endif /* CONFIG_NUMA */
+	return zone_to_nid(zoneref->zone);
 }
 
 struct zoneref *__next_zones_zonelist(struct zoneref *z,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4861ba738d6f..da858f794eb6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void)
 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
 		z = first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes);
-		return z->zone ? z->zone->node : node;
+		return z->zone ? zone_to_nid(z->zone) : node;
 	}
 
 	default:
@@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
 				gfp_zone(GFP_HIGHUSER),
 				&pol->v.nodes);
-		polnid = z->zone->node;
+		polnid = zone_to_nid(z->zone);
 		break;
 
 	default:
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5b72266b4b03..6838a530789b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void)
 				zone->name);
 
 			/* Iterate the zonelist */
-			for_each_zone_zonelist(zone, z, zonelist, zoneid) {
-#ifdef CONFIG_NUMA
-				pr_cont("%d:%s ", zone->node, zone->name);
-#else
-				pr_cont("0:%s ", zone->name);
-#endif /* CONFIG_NUMA */
-			}
+			for_each_zone_zonelist(zone, z, zonelist, zoneid)
+				pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
 			pr_cont("\n");
 		}
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 455cc3bfae99..6b2324bc61db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 	if (!static_branch_likely(&vm_numa_stat_key))
 		return;
 
-	if (z->node != numa_node_id())
+	if (zone_to_nid(z) != numa_node_id())
 		local_stat = NUMA_OTHER;
 
-	if (z->node == preferred_zone->node)
+	if (zone_to_nid(z) == zone_to_nid(preferred_zone))
 		__inc_numa_state(z, NUMA_HIT);
 	else {
 		__inc_numa_state(z, NUMA_MISS);
@@ -5278,7 +5278,7 @@ int local_memory_node(int node)
 	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
 				   gfp_zone(GFP_KERNEL),
 				   NULL);
-	return z->zone->node;
+	return zone_to_nid(z->zone);
 }
 #endif
 
@@ -6299,9 +6299,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		 * And all highmem pages will be managed by the buddy system.
 		 */
 		zone->managed_pages = freesize;
-#ifdef CONFIG_NUMA
-		zone->node = nid;
-#endif
+		zone_set_nid(zone, nid);
 		zone->name = zone_names[j];
 		zone->zone_pgdat = pgdat;
 		spin_lock_init(&zone->lock);
-- 
cgit v1.2.3


From 7cc2a9596d77e598a476ec7819046a453378d4a6 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Tue, 21 Aug 2018 21:53:36 -0700
Subject: mm: remove __paginginit

__paginginit is the same thing as __meminit except for platforms without
sparsemem, there it is defined as __init.

Remove __paginginit and use __meminit.  Use __ref in one single function
that merges __meminit and __init sections: setup_usemap().

Link: http://lkml.kernel.org/r/20180801122348.21588-4-osalvador@techadventures.net
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h   | 12 ------------
 mm/page_alloc.c | 19 ++++++++++---------
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 9e3654d70289..dab088cb6937 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter,
 	return iter + 1;
 }
 
-/*
- * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
- * so all functions starting at paging_init should be marked __init
- * in those cases. SPARSEMEM, however, allows for memory hotplug,
- * and alloc_bootmem_node is not used.
- */
-#ifdef CONFIG_SPARSEMEM
-#define __paginginit __meminit
-#else
-#define __paginginit __init
-#endif
-
 /* Memory initialisation debug and verification */
 enum mminit_level {
 	MMINIT_WARNING,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6b2324bc61db..98fe3b68f209 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
 	return usemapsize / 8;
 }
 
-static void __init setup_usemap(struct pglist_data *pgdat,
+static void __ref setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone,
 				unsigned long zone_start_pfn,
 				unsigned long zonesize)
@@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __meminit set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-void __paginginit set_pageblock_order(void)
+void __meminit set_pageblock_order(void)
 {
 }
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
-						   unsigned long present_pages)
+static unsigned long __meminit calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
 {
 	unsigned long pages = spanned_pages;
 
@@ -6233,7 +6233,7 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
  *
  * NOTE: pgdat should get zeroed by caller.
  */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat)
+static void __meminit free_area_init_core(struct pglist_data *pgdat)
 {
 	enum zone_type j;
 	int nid = pgdat->node_id;
@@ -6364,8 +6364,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 
-void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
-		unsigned long node_start_pfn, unsigned long *zholes_size)
+void __meminit free_area_init_node(int nid, unsigned long *zones_size,
+				   unsigned long node_start_pfn,
+				   unsigned long *zholes_size)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long start_pfn = 0;
@@ -6410,7 +6411,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  * may be accessed (for example page_to_pfn() on some configuration accesses
  * flags). We must explicitly zero those struct pages.
  */
-void __paginginit zero_resv_unavail(void)
+void __meminit zero_resv_unavail(void)
 {
 	phys_addr_t start, end;
 	unsigned long pfn;
-- 
cgit v1.2.3


From 0188dc98ad5c7c361d46175623471d4be0fb8610 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 21 Aug 2018 21:53:39 -0700
Subject: mm/page_alloc: inline function to handle
 CONFIG_DEFERRED_STRUCT_PAGE_INIT

Let us move the code between CONFIG_DEFERRED_STRUCT_PAGE_INIT to an inline
function.  Not having an ifdef in the function makes the code more
readable.

Link: http://lkml.kernel.org/r/20180730101757.28058-4-osalvador@techadventures.net
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98fe3b68f209..5b939bd1bff9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6364,6 +6364,21 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
+	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+						pgdat->node_spanned_pages);
+	pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
 void __meminit free_area_init_node(int nid, unsigned long *zones_size,
 				   unsigned long node_start_pfn,
 				   unsigned long *zholes_size)
@@ -6390,16 +6405,8 @@ void __meminit free_area_init_node(int nid, unsigned long *zones_size,
 				  zones_size, zholes_size);
 
 	alloc_node_mem_map(pgdat);
+	pgdat_set_deferred_range(pgdat);
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-	/*
-	 * We start only with one section of pages, more pages are added as
-	 * needed until the rest of deferred pages are initialized.
-	 */
-	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
-					 pgdat->node_spanned_pages);
-	pgdat->first_deferred_pfn = ULONG_MAX;
-#endif
 	free_area_init_core(pgdat);
 }
 
-- 
cgit v1.2.3


From 03e85f9d5f1f8c74f127c5f7a87575d74a78d248 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 21 Aug 2018 21:53:43 -0700
Subject: mm/page_alloc: Introduce free_area_init_core_hotplug

Currently, whenever a new node is created/re-used from the memhotplug
path, we call free_area_init_node()->free_area_init_core().  But there is
some code that we do not really need to run when we are coming from such
path.

free_area_init_core() performs the following actions:

1) Initializes pgdat internals, such as spinlock, waitqueues and more.
2) Account # nr_all_pages and # nr_kernel_pages. These values are used later on
   when creating hash tables.
3) Account number of managed_pages per zone, substracting dma_reserved and
   memmap pages.
4) Initializes some fields of the zone structure data
5) Calls init_currently_empty_zone to initialize all the freelists
6) Calls memmap_init to initialize all pages belonging to certain zone

When called from memhotplug path, free_area_init_core() only performs
actions #1 and #4.

Action #2 is pointless as the zones do not have any pages since either the
node was freed, or we are re-using it, eitherway all zones belonging to
this node should have 0 pages.  For the same reason, action #3 results
always in manages_pages being 0.

Action #5 and #6 are performed later on when onlining the pages:
 online_pages()->move_pfn_range_to_zone()->init_currently_empty_zone()
 online_pages()->move_pfn_range_to_zone()->memmap_init_zone()

This patch does two things:

First, moves the node/zone initializtion to their own function, so it
allows us to create a small version of free_area_init_core, where we only
perform:

1) Initialization of pgdat internals, such as spinlock, waitqueues and more
4) Initialization of some fields of the zone structure data

These two functions are: pgdat_init_internals() and zone_init_internals().

The second thing this patch does, is to introduce
free_area_init_core_hotplug(), the memhotplug version of
free_area_init_core():

Currently, we call free_area_init_node() from the memhotplug path.  In
there, we set some pgdat's fields, and call calculate_node_totalpages().
calculate_node_totalpages() calculates the # of pages the node has.

Since the node is either new, or we are re-using it, the zones belonging
to this node should not have any pages, so there is no point to calculate
this now.

Actually, we re-set these values to 0 later on with the calls to:

reset_node_managed_pages()
reset_node_present_pages()

The # of pages per node and the # of pages per zone will be calculated when
onlining the pages:

online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_zone_range()
online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_pgdat_range()

Also, since free_area_init_core/free_area_init_node will now only get called during early init, let us replace
__paginginit with __init, so their code gets freed up.

[osalvador@techadventures.net: fix section usage]
  Link: http://lkml.kernel.org/r/20180731101752.GA473@techadventures.net
[osalvador@suse.de: v6]
  Link: http://lkml.kernel.org/r/20180801122348.21588-6-osalvador@techadventures.net
Link: http://lkml.kernel.org/r/20180730101757.28058-5-osalvador@techadventures.net
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Pasha Tatashin <Pavel.Tatashin@microsoft.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h |  1 +
 include/linux/mm.h             |  2 +-
 mm/memory_hotplug.c            | 16 +++------
 mm/page_alloc.c                | 78 +++++++++++++++++++++++++++++-------------
 4 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4e9828cda7a2..34a28227068d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 static inline void remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+extern void __ref free_area_init_core_hotplug(int nid);
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a55f5389c491..a9e733b5fb76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2015,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
 
 extern void __init pagecache_init(void);
 extern void free_area_init(unsigned long * zones_size);
-extern void free_area_init_node(int nid, unsigned long * zones_size,
+extern void __init free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4eb6e824a80c..9eea6e809a4e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat)
 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
 	struct pglist_data *pgdat;
-	unsigned long zones_size[MAX_NR_ZONES] = {0};
-	unsigned long zholes_size[MAX_NR_ZONES] = {0};
 	unsigned long start_pfn = PFN_DOWN(start);
 
 	pgdat = NODE_DATA(nid);
@@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
 	/* we can use NODE_DATA(nid) from here */
 
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = start_pfn;
+
 	/* init node's zones as empty zones, we don't have any present pages.*/
-	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+	free_area_init_core_hotplug(nid);
 	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
 
 	/*
@@ -1016,19 +1017,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	 */
 	build_all_zonelists(pgdat);
 
-	/*
-	 * zone->managed_pages is set to an approximate value in
-	 * free_area_init_core(), which will cause
-	 * /sys/device/system/node/nodeX/meminfo has wrong data.
-	 * So reset it to 0 before any memory is onlined.
-	 */
-	reset_node_managed_pages(pgdat);
-
 	/*
 	 * When memory is hot-added, all the memory is in offline state. So
 	 * clear all zones' present_pages because they will be updated in
 	 * online_pages() and offline_pages().
 	 */
+	reset_node_managed_pages(pgdat);
 	reset_node_present_pages(pgdat);
 
 	return pgdat;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b939bd1bff9..c677c1506d73 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __meminit set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -6168,13 +6168,13 @@ void __meminit set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-void __meminit set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-static unsigned long __meminit calc_memmap_size(unsigned long spanned_pages,
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
 						unsigned long present_pages)
 {
 	unsigned long pages = spanned_pages;
@@ -6225,19 +6225,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
 #endif
 
-/*
- * Set up the zone data structures:
- *   - mark all pages reserved
- *   - mark all memory queues empty
- *   - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- */
-static void __meminit free_area_init_core(struct pglist_data *pgdat)
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 {
-	enum zone_type j;
-	int nid = pgdat->node_id;
-
 	pgdat_resize_init(pgdat);
 
 	pgdat_init_numabalancing(pgdat);
@@ -6250,7 +6239,54 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat)
 	pgdat_page_ext_init(pgdat);
 	spin_lock_init(&pgdat->lru_lock);
 	lruvec_init(node_lruvec(pgdat));
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+							unsigned long remaining_pages)
+{
+	zone->managed_pages = remaining_pages;
+	zone_set_nid(zone, nid);
+	zone->name = zone_names[idx];
+	zone->zone_pgdat = NODE_DATA(nid);
+	spin_lock_init(&zone->lock);
+	zone_seqlock_init(zone);
+	zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+	enum zone_type z;
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	pgdat_init_internals(pgdat);
+	for (z = 0; z < MAX_NR_ZONES; z++)
+		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+	enum zone_type j;
+	int nid = pgdat->node_id;
 
+	pgdat_init_internals(pgdat);
 	pgdat->per_cpu_nodestats = &boot_nodestats;
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6298,13 +6334,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat)
 		 * when the bootmem allocator frees pages into the buddy system.
 		 * And all highmem pages will be managed by the buddy system.
 		 */
-		zone->managed_pages = freesize;
-		zone_set_nid(zone, nid);
-		zone->name = zone_names[j];
-		zone->zone_pgdat = pgdat;
-		spin_lock_init(&zone->lock);
-		zone_seqlock_init(zone);
-		zone_pcp_init(zone);
+		zone_init_internals(zone, j, nid, freesize);
 
 		if (!size)
 			continue;
@@ -6379,7 +6409,7 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
 #endif
 
-void __meminit free_area_init_node(int nid, unsigned long *zones_size,
+void __init free_area_init_node(int nid, unsigned long *zones_size,
 				   unsigned long node_start_pfn,
 				   unsigned long *zholes_size)
 {
@@ -6418,7 +6448,7 @@ void __meminit free_area_init_node(int nid, unsigned long *zones_size,
  * may be accessed (for example page_to_pfn() on some configuration accesses
  * flags). We must explicitly zero those struct pages.
  */
-void __meminit zero_resv_unavail(void)
+void __init zero_resv_unavail(void)
 {
 	phys_addr_t start, end;
 	unsigned long pfn;
-- 
cgit v1.2.3


From 1caed86022b9eb927283d8b2cdd7216baf43029e Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Tue, 21 Aug 2018 21:53:47 -0700
Subject: tools/testing/selftests/vm/: add MAP_POPULATE test

As with many other projects, we use some shmalloc allocator.  At some
point we need to make a part of allocated pages back private to process.
And it should be populated straight away.  Check that (MAP_PRIVATE |
MAP_POPULATE) actually copies the private page.

[akpm@linux-foundation.org: change message, per review discussion]
Link: http://lkml.kernel.org/r/20180801233636.29354-1-dima@arista.com
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Hua Zhong <hzhong@arista.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Stuart Ritchie <sritchie@arista.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/.gitignore     |   1 +
 tools/testing/selftests/vm/Makefile       |   1 +
 tools/testing/selftests/vm/map_populate.c | 113 ++++++++++++++++++++++++++++++
 tools/testing/selftests/vm/run_vmtests    |  11 +++
 4 files changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/vm/map_populate.c

diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 342c7bc9dc8c..af5ff83f6d7f 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,6 +1,7 @@
 hugepage-mmap
 hugepage-shm
 map_hugetlb
+map_populate
 thuge-gen
 compaction_test
 mlock2-tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index fdefa2295ddc..9881876d2aa0 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-shm
 TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += mlock2-tests
 TEST_GEN_FILES += on-fault-limit
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
new file mode 100644
index 000000000000..6b8aeaa0bf7a
--- /dev/null
+++ b/tools/testing/selftests/vm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ		4096
+#endif
+
+#define BUG_ON(condition, description)					\
+	do {								\
+		if (condition) {					\
+			fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+				__LINE__, (description), strerror(errno)); \
+			exit(1);					\
+		}							\
+	} while (0)
+
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+	int status, ret;
+
+	ret = read(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	*smap = 0x22222BAD;
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = write(sock, &status, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	waitpid(child, &status, 0);
+	BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+	return WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+	int ret, buf = 0;
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_POPULATE, fd, 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+	ret = write(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "write(sock)");
+
+	ret = read(sock, &buf, sizeof(int));
+	BUG_ON(ret <= 0, "read(sock)");
+
+	BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+	BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int sock[2], child, ret;
+	FILE *ftmp;
+	unsigned long *smap;
+
+	ftmp = tmpfile();
+	BUG_ON(ftmp == 0, "tmpfile()");
+
+	ret = ftruncate(fileno(ftmp), MMAP_SZ);
+	BUG_ON(ret, "ftruncate()");
+
+	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+			MAP_SHARED, fileno(ftmp), 0);
+	BUG_ON(smap == MAP_FAILED, "mmap()");
+
+	*smap = 0xdeadbabe;
+	/* Probably unnecessary, but let it be. */
+	ret = msync(smap, MMAP_SZ, MS_SYNC);
+	BUG_ON(ret, "msync()");
+
+	ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+	BUG_ON(ret, "socketpair()");
+
+	child = fork();
+	BUG_ON(child == -1, "fork()");
+
+	if (child) {
+		ret = close(sock[0]);
+		BUG_ON(ret, "close()");
+
+		return parent_f(sock[1], smap, child);
+	}
+
+	ret = close(sock[1]);
+	BUG_ON(ret, "close()");
+
+	return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 88cbe5575f0c..584a91ae4a8f 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -167,6 +167,17 @@ else
 	echo "[PASS]"
 fi
 
+echo "--------------------"
+echo "running map_populate"
+echo "--------------------"
+./map_populate
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+	exitcode=1
+else
+	echo "[PASS]"
+fi
+
 echo "--------------------"
 echo "running mlock2-tests"
 echo "--------------------"
-- 
cgit v1.2.3


From 5989ad7b5ede38d605c588981f634c08252abfc3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 21 Aug 2018 21:53:50 -0700
Subject: mm, oom: refactor oom_kill_process()

Patch series "introduce memory.oom.group", v2.

This is a tiny implementation of cgroup-aware OOM killer, which adds an
ability to kill a cgroup as a single unit and so guarantee the integrity
of the workload.

Although it has only a limited functionality in comparison to what now
resides in the mm tree (it doesn't change the victim task selection
algorithm, doesn't look at memory stas on cgroup level, etc), it's also
much simpler and more straightforward.  So, hopefully, we can avoid having
long debates here, as we had with the full implementation.

As it doesn't prevent any futher development, and implements an useful and
complete feature, it looks as a sane way forward.

This patch (of 2):

oom_kill_process() consists of two logical parts: the first one is
responsible for considering task's children as a potential victim and
printing the debug information.  The second half is responsible for
sending SIGKILL to all tasks sharing the mm struct with the given victim.

This commit splits oom_kill_process() with an intention to re-use the the
second half: __oom_kill_process().

The cgroup-aware OOM killer will kill multiple tasks belonging to the
victim cgroup.  We don't need to print the debug information for the each
task, as well as play with task selection (considering task's children),
so we can't use the existing oom_kill_process().

Link: http://lkml.kernel.org/r/20171130152824.1591-2-guro@fb.com
Link: http://lkml.kernel.org/r/20180802003201.817-3-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 123 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 58 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 20600779f5db..330416c67ce5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -829,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
 	return ret;
 }
 
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
 {
-	struct task_struct *p = oc->chosen;
-	unsigned int points = oc->chosen_points;
-	struct task_struct *victim = p;
-	struct task_struct *child;
-	struct task_struct *t;
+	struct task_struct *p;
 	struct mm_struct *mm;
-	unsigned int victim_points = 0;
-	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
-					      DEFAULT_RATELIMIT_BURST);
 	bool can_oom_reap = true;
 
-	/*
-	 * If the task is already exiting, don't alarm the sysadmin or kill
-	 * its children or threads, just give it access to memory reserves
-	 * so it can die quickly
-	 */
-	task_lock(p);
-	if (task_will_free_mem(p)) {
-		mark_oom_victim(p);
-		wake_oom_reaper(p);
-		task_unlock(p);
-		put_task_struct(p);
-		return;
-	}
-	task_unlock(p);
-
-	if (__ratelimit(&oom_rs))
-		dump_header(oc, p);
-
-	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-
-	/*
-	 * If any of p's children has a different mm and is eligible for kill,
-	 * the one with the highest oom_badness() score is sacrificed for its
-	 * parent.  This attempts to lose the minimal amount of work done while
-	 * still freeing memory.
-	 */
-	read_lock(&tasklist_lock);
-	for_each_thread(p, t) {
-		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
-
-			if (process_shares_mm(child, p->mm))
-				continue;
-			/*
-			 * oom_badness() returns 0 if the thread is unkillable
-			 */
-			child_points = oom_badness(child,
-				oc->memcg, oc->nodemask, oc->totalpages);
-			if (child_points > victim_points) {
-				put_task_struct(victim);
-				victim = child;
-				victim_points = child_points;
-				get_task_struct(victim);
-			}
-		}
-	}
-	read_unlock(&tasklist_lock);
-
 	p = find_lock_task_mm(victim);
 	if (!p) {
 		put_task_struct(victim);
@@ -964,6 +908,69 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 }
 #undef K
 
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+	struct task_struct *p = oc->chosen;
+	unsigned int points = oc->chosen_points;
+	struct task_struct *victim = p;
+	struct task_struct *child;
+	struct task_struct *t;
+	unsigned int victim_points = 0;
+	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just give it access to memory reserves
+	 * so it can die quickly
+	 */
+	task_lock(p);
+	if (task_will_free_mem(p)) {
+		mark_oom_victim(p);
+		wake_oom_reaper(p);
+		task_unlock(p);
+		put_task_struct(p);
+		return;
+	}
+	task_unlock(p);
+
+	if (__ratelimit(&oom_rs))
+		dump_header(oc, p);
+
+	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+		message, task_pid_nr(p), p->comm, points);
+
+	/*
+	 * If any of p's children has a different mm and is eligible for kill,
+	 * the one with the highest oom_badness() score is sacrificed for its
+	 * parent.  This attempts to lose the minimal amount of work done while
+	 * still freeing memory.
+	 */
+	read_lock(&tasklist_lock);
+	for_each_thread(p, t) {
+		list_for_each_entry(child, &t->children, sibling) {
+			unsigned int child_points;
+
+			if (process_shares_mm(child, p->mm))
+				continue;
+			/*
+			 * oom_badness() returns 0 if the thread is unkillable
+			 */
+			child_points = oom_badness(child,
+				oc->memcg, oc->nodemask, oc->totalpages);
+			if (child_points > victim_points) {
+				put_task_struct(victim);
+				victim = child;
+				victim_points = child_points;
+				get_task_struct(victim);
+			}
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	__oom_kill_process(victim);
+}
+
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-- 
cgit v1.2.3


From 3d8b38eb81cac81395f6a823f6bf401b327268e6 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Tue, 21 Aug 2018 21:53:54 -0700
Subject: mm, oom: introduce memory.oom.group

For some workloads an intervention from the OOM killer can be painful.
Killing a random task can bring the workload into an inconsistent state.

Historically, there are two common solutions for this
problem:
1) enabling panic_on_oom,
2) using a userspace daemon to monitor OOMs and kill
   all outstanding processes.

Both approaches have their downsides: rebooting on each OOM is an obvious
waste of capacity, and handling all in userspace is tricky and requires a
userspace agent, which will monitor all cgroups for OOMs.

In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate
the necessity of enabling panic_on_oom.  Also, it can simplify the cgroup
management for userspace applications.

This commit introduces a new knob for cgroup v2 memory controller:
memory.oom.group.  The knob determines whether the cgroup should be
treated as an indivisible workload by the OOM killer.  If set, all tasks
belonging to the cgroup or to its descendants (if the memory cgroup is not
a leaf cgroup) are killed together or not at all.

To determine which cgroup has to be killed, we do traverse the cgroup
hierarchy from the victim task's cgroup up to the OOMing cgroup (or root)
and looking for the highest-level cgroup with memory.oom.group set.

Tasks with the OOM protection (oom_score_adj set to -1000) are treated as
an exception and are never killed.

This patch doesn't change the OOM victim selection algorithm.

Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 18 +++++++
 include/linux/memcontrol.h              | 18 +++++++
 mm/memcontrol.c                         | 93 +++++++++++++++++++++++++++++++++
 mm/oom_kill.c                           | 30 +++++++++++
 4 files changed, 159 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1746131bc9cb..184193bcb262 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back.
 	high limit is used and monitored properly, this limit's
 	utility is limited to providing the final safety net.
 
+  memory.oom.group
+	A read-write single value file which exists on non-root
+	cgroups.  The default value is "0".
+
+	Determines whether the cgroup should be treated as
+	an indivisible workload by the OOM killer. If set,
+	all tasks belonging to the cgroup or to its descendants
+	(if the memory cgroup is not a leaf cgroup) are killed
+	together or not at all. This can be used to avoid
+	partial kills to guarantee workload integrity.
+
+	Tasks with the OOM protection (oom_score_adj set to -1000)
+	are treated as an exception and are never killed.
+
+	If the OOM killer is invoked in a cgroup, it's not going
+	to kill any tasks outside of this cgroup, regardless
+	memory.oom.group values of ancestor cgroups.
+
   memory.events
 	A read-only flat-keyed file which exists on non-root cgroups.
 	The following entries are defined.  Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0e6c515fb698..652f602167df 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -225,6 +225,11 @@ struct mem_cgroup {
 	 */
 	bool use_hierarchy;
 
+	/*
+	 * Should the OOM killer kill all belonging tasks, had it kill one?
+	 */
+	bool oom_group;
+
 	/* protected by memcg_oom_lock */
 	bool		oom_lock;
 	int		under_oom;
@@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 }
 
 bool mem_cgroup_oom_synchronize(bool wait);
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+					    struct mem_cgroup *oom_domain);
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
 	return false;
 }
 
+static inline struct mem_cgroup *mem_cgroup_get_oom_group(
+	struct task_struct *victim, struct mem_cgroup *oom_domain)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+}
+
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59c14c988143..4ead5a4817de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1776,6 +1776,62 @@ cleanup:
 	return true;
 }
 
+/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+					    struct mem_cgroup *oom_domain)
+{
+	struct mem_cgroup *oom_group = NULL;
+	struct mem_cgroup *memcg;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return NULL;
+
+	if (!oom_domain)
+		oom_domain = root_mem_cgroup;
+
+	rcu_read_lock();
+
+	memcg = mem_cgroup_from_task(victim);
+	if (memcg == root_mem_cgroup)
+		goto out;
+
+	/*
+	 * Traverse the memory cgroup hierarchy from the victim task's
+	 * cgroup up to the OOMing cgroup (or root) to find the
+	 * highest-level memory cgroup with oom.group set.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		if (memcg->oom_group)
+			oom_group = memcg;
+
+		if (memcg == oom_domain)
+			break;
+	}
+
+	if (oom_group)
+		css_get(&oom_group->css);
+out:
+	rcu_read_unlock();
+
+	return oom_group;
+}
+
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+	pr_info("Tasks in ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
 /**
  * lock_page_memcg - lock a page->mem_cgroup binding
  * @page: the page
@@ -5561,6 +5617,37 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->oom_group);
+
+	return 0;
+}
+
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, oom_group;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &oom_group);
+	if (ret)
+		return ret;
+
+	if (oom_group != 0 && oom_group != 1)
+		return -EINVAL;
+
+	memcg->oom_group = oom_group;
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -5602,6 +5689,12 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = memory_stat_show,
 	},
+	{
+		.name = "oom.group",
+		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+		.seq_show = memory_oom_group_show,
+		.write = memory_oom_group_write,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 330416c67ce5..0e10b864e074 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -908,6 +908,19 @@ static void __oom_kill_process(struct task_struct *victim)
 }
 #undef K
 
+/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+		get_task_struct(task);
+		__oom_kill_process(task);
+	}
+	return 0;
+}
+
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
 	struct task_struct *p = oc->chosen;
@@ -915,6 +928,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	struct task_struct *victim = p;
 	struct task_struct *child;
 	struct task_struct *t;
+	struct mem_cgroup *oom_group;
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
@@ -968,7 +982,23 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	}
 	read_unlock(&tasklist_lock);
 
+	/*
+	 * Do we need to kill the entire memory cgroup?
+	 * Or even one of the ancestor memory cgroups?
+	 * Check this out before killing the victim task.
+	 */
+	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
 	__oom_kill_process(victim);
+
+	/*
+	 * If necessary, kill all tasks in the selected memory cgroup.
+	 */
+	if (oom_group) {
+		mem_cgroup_print_oom_group(oom_group);
+		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+		mem_cgroup_put(oom_group);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 7e8a6304d5419cbf056a59de92939e5eef039c57 Mon Sep 17 00:00:00 2001
From: "Dennis Zhou (Facebook)" <dennisszhou@gmail.com>
Date: Tue, 21 Aug 2018 21:53:58 -0700
Subject: /proc/meminfo: add percpu populated pages count

Currently, percpu memory only exposes allocation and utilization
information via debugfs.  This more or less is only really useful for
understanding the fragmentation and allocation information at a per-chunk
level with a few global counters.  This is also gated behind a config.
BPF and cgroup, for example, have seen an increase in use causing
increased use of percpu memory.  Let's make it easier for someone to
identify how much memory is being used.

This patch adds the "Percpu" stat to meminfo to more easily look up how
much percpu memory is in use.  This number includes the cost for all
allocated backing pages and not just insight at the per a unit, per chunk
level.  Metadata is excluded.  I think excluding metadata is fair because
the backing memory scales with the numbere of cpus and can quickly
outweigh the metadata.  It also makes this calculation light.

Link: http://lkml.kernel.org/r/20180807184723.74919-1-dennisszhou@gmail.com
Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  3 +++
 fs/proc/meminfo.c                  |  2 ++
 include/linux/percpu.h             |  2 ++
 mm/percpu.c                        | 29 +++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 1605acbb23b6..22b4b00dee31 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -870,6 +870,7 @@ Committed_AS:   100056 kB
 VmallocTotal:   112216 kB
 VmallocUsed:       428 kB
 VmallocChunk:   111088 kB
+Percpu:          62080 kB
 HardwareCorrupted:   0 kB
 AnonHugePages:   49152 kB
 ShmemHugePages:      0 kB
@@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system.
 VmallocTotal: total size of vmalloc memory area
  VmallocUsed: amount of vmalloc area which is used
 VmallocChunk: largest contiguous block of vmalloc area which is free
+      Percpu: Memory allocated to the percpu allocator used to back percpu
+              allocations. This stat excludes the cost of metadata.
 
 ..............................................................................
 
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 2fb04846ed11..edda898714eb 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -7,6 +7,7 @@
 #include <linux/mman.h>
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
+#include <linux/percpu.h>
 #include <linux/quicklist.h>
 #include <linux/seq_file.h>
 #include <linux/swap.h>
@@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		   (unsigned long)VMALLOC_TOTAL >> 10);
 	show_val_kb(m, "VmallocUsed:    ", 0ul);
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
+	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
 #ifdef CONFIG_MEMORY_FAILURE
 	seq_printf(m, "HardwareCorrupted: %5lu kB\n",
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 296bbe49d5d1..70b7123f38c7 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
+extern unsigned long pcpu_nr_pages(void);
+
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/percpu.c b/mm/percpu.c
index 0b6480979ac7..a749d4d96e3e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -169,6 +169,14 @@ static LIST_HEAD(pcpu_map_extend_chunks);
  */
 int pcpu_nr_empty_pop_pages;
 
+/*
+ * The number of populated pages in use by the allocator, protected by
+ * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
 /*
  * Balance work is used to populate or destroy chunks asynchronously.  We
  * try to keep the number of populated free pages between
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
 
 	bitmap_set(chunk->populated, page_start, nr);
 	chunk->nr_populated += nr;
+	pcpu_nr_populated += nr;
 
 	if (!for_alloc) {
 		chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
 	chunk->nr_populated -= nr;
 	chunk->nr_empty_pop_pages -= nr;
 	pcpu_nr_empty_pop_pages -= nr;
+	pcpu_nr_populated -= nr;
 }
 
 /*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
+	/* include all regions of the first chunk */
+	pcpu_nr_populated += PFN_DOWN(size_sum);
+
 	pcpu_stats_chunk_alloc();
 	trace_percpu_create_chunk(base_addr);
 
@@ -2745,6 +2758,22 @@ void __init setup_per_cpu_areas(void)
 
 #endif	/* CONFIG_SMP */
 
+/*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks.  Metadata is
+ * excluded in the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata.  It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+	return pcpu_nr_populated * pcpu_nr_units;
+}
+
 /*
  * Percpu allocator is initialized early during boot when neither slab or
  * workqueue is available.  Plug async management until everything is up
-- 
cgit v1.2.3


From c8bd134a4bddafe5917d163eea73873932c15e83 Mon Sep 17 00:00:00 2001
From: Peter Kalauskas <peskal@google.com>
Date: Tue, 21 Aug 2018 21:54:02 -0700
Subject: drivers/block/zram/zram_drv.c: fix bug storing backing_dev

The call to strlcpy in backing_dev_store is incorrect. It should take
the size of the destination buffer instead of the size of the source
buffer.  Additionally, ignore the newline character (\n) when reading
the new file_name buffer. This makes it possible to set the backing_dev
as follows:

	echo /dev/sdX > /sys/block/zram0/backing_dev

The reason it worked before was the fact that strlcpy() copies 'len - 1'
bytes, which is strlen(buf) - 1 in our case, so it accidentally didn't
copy the trailing new line symbol.  Which also means that "echo -n
/dev/sdX" most likely was broken.

Signed-off-by: Peter Kalauskas <peskal@google.com>
Link: http://lkml.kernel.org/r/20180813061623.GC64836@rodete-desktop-imager.corp.google.com
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: <stable@vger.kernel.org>    [4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index c7acf74253a1..a1d6b5597c17 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -337,6 +337,7 @@ static ssize_t backing_dev_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
 	char *file_name;
+	size_t sz;
 	struct file *backing_dev = NULL;
 	struct inode *inode;
 	struct address_space *mapping;
@@ -357,7 +358,11 @@ static ssize_t backing_dev_store(struct device *dev,
 		goto out;
 	}
 
-	strlcpy(file_name, buf, len);
+	strlcpy(file_name, buf, PATH_MAX);
+	/* ignore trailing newline */
+	sz = strlen(file_name);
+	if (sz > 0 && file_name[sz - 1] == '\n')
+		file_name[sz - 1] = 0x00;
 
 	backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
 	if (IS_ERR(backing_dev)) {
-- 
cgit v1.2.3


From 5df66d306ec9b1952d3d183fe4e2ced1a7b2bddb Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 21 Aug 2018 21:54:06 -0700
Subject: mm: fix comment for NODEMASK_ALLOC

Currently, NODEMASK_ALLOC allocates a nodemask_t with kmalloc when
NODES_SHIFT is higher than 8, otherwise it declares it within the stack.

The comment says that the reasoning behind this, is that nodemask_t will
be 256 bytes when NODES_SHIFT is higher than 8, but this is not true.  For
example, NODES_SHIFT = 9 will give us a 64 bytes nodemask_t.  Let us fix
up the comment for that.

Another thing is that it might make sense to let values lower than
128bytes be allocated in the stack.  Although this all depends on the
depth of the stack (and this changes from function to function), I think
that 64 bytes is something we can easily afford.  So we could even bump
the limit by 1 (from > 8 to > 9).

Link: http://lkml.kernel.org/r/20180820085516.9687-1-osalvador@techadventures.net
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nodemask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 1fbde8a880d9..5a30ad594ccc 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -518,7 +518,7 @@ static inline int node_random(const nodemask_t *mask)
  * NODEMASK_ALLOC(type, name) allocates an object with a specified type and
  * name.
  */
-#if NODES_SHIFT > 8 /* nodemask_t > 256 bytes */
+#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
 #define NODEMASK_ALLOC(type, name, gfp_flags)	\
 			type *name = kmalloc(sizeof(*name), gfp_flags)
 #define NODEMASK_FREE(m)			kfree(m)
-- 
cgit v1.2.3


From 2d6e4e822ad8174b59414903ae6b35660032c20a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:09 -0700
Subject: proc: fixup PDE allocation bloat

24074a35c5c975 ("proc: Make inline name size calculation automatic")
started to put PDE allocations into kmalloc-256 which is unnecessary as
~40 character names are very rare.

Put allocation back into kmalloc-192 cache for 64-bit non-debug builds.

Put BUILD_BUG_ON to know when PDE size has gotten out of control.

[adobriyan@gmail.com: fix BUILD_BUG_ON breakage on powerpc64]
  Link: http://lkml.kernel.org/r/20180703191602.GA25521@avx2
Link: http://lkml.kernel.org/r/20180617215732.GA24688@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c    |  6 ++++--
 fs/proc/internal.h | 17 +++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 85ffbd27f288..fc5306a31a1d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -105,8 +105,10 @@ void __init proc_init_kmemcache(void)
 		kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
 				  SLAB_ACCOUNT|SLAB_PANIC, NULL);
 	proc_dir_entry_cache = kmem_cache_create_usercopy(
-		"proc_dir_entry", SIZEOF_PDE_SLOT, 0, SLAB_PANIC,
-		OFFSETOF_PDE_NAME, SIZEOF_PDE_INLINE_NAME, NULL);
+		"proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC,
+		offsetof(struct proc_dir_entry, inline_name),
+		SIZEOF_PDE_INLINE_NAME, NULL);
+	BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
 }
 
 static int proc_show_options(struct seq_file *seq, struct dentry *root)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f5b75d258d22..c9d97818a421 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,16 +65,13 @@ struct proc_dir_entry {
 	char inline_name[];
 } __randomize_layout;
 
-#define OFFSETOF_PDE_NAME offsetof(struct proc_dir_entry, inline_name)
-#define SIZEOF_PDE_SLOT					\
-	(OFFSETOF_PDE_NAME + 34 <= 64 ? 64 :		\
-	 OFFSETOF_PDE_NAME + 34 <= 128 ? 128 :		\
-	 OFFSETOF_PDE_NAME + 34 <= 192 ? 192 :		\
-	 OFFSETOF_PDE_NAME + 34 <= 256 ? 256 :		\
-	 OFFSETOF_PDE_NAME + 34 <= 512 ? 512 :		\
-	 0)
-
-#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE_SLOT - OFFSETOF_PDE_NAME)
+#define SIZEOF_PDE	(				\
+	sizeof(struct proc_dir_entry) < 128 ? 128 :	\
+	sizeof(struct proc_dir_entry) < 192 ? 192 :	\
+	sizeof(struct proc_dir_entry) < 256 ? 256 :	\
+	sizeof(struct proc_dir_entry) < 512 ? 512 :	\
+	0)
+#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
 
 extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
-- 
cgit v1.2.3


From bdf228a27278d398ad26a156f01811be405867f9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:54:13 -0700
Subject: fs/proc/uptime.c: use ktime_get_boottime_ts64

get_monotonic_boottime() is deprecated and uses the old timespec type.
Let's convert /proc/uptime to use ktime_get_boottime_ts64().

Link: http://lkml.kernel.org/r/20180620081746.282742-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/uptime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 3f723cb478af..a4c2791ab70b 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -9,7 +9,7 @@
 
 static int uptime_proc_show(struct seq_file *m, void *v)
 {
-	struct timespec uptime;
+	struct timespec64 uptime;
 	struct timespec64 idle;
 	u64 nsec;
 	u32 rem;
@@ -19,7 +19,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	for_each_possible_cpu(i)
 		nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
-	get_monotonic_boottime(&uptime);
+	ktime_get_boottime_ts64(&uptime);
 	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	idle.tv_nsec = rem;
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
-- 
cgit v1.2.3


From 61d47c4e71c1f080e7412315c8685bc682a8e53a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:16 -0700
Subject: proc: test /proc/self symlink

There are plans to change how /proc/self result is calculated,
for that a test is necessary.

Use direct system call because of this whole getpid caching story.

Link: http://lkml.kernel.org/r/20180627195103.GB18113@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/proc/.gitignore |  1 +
 tools/testing/selftests/proc/Makefile   |  2 ++
 tools/testing/selftests/proc/proc.h     |  7 ++++++
 tools/testing/selftests/proc/self.c     | 39 +++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)
 create mode 100644 tools/testing/selftests/proc/self.c

diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 74e5912e9f2e..ee33d6c3137f 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -9,3 +9,4 @@
 /proc-uptime-001
 /proc-uptime-002
 /read
+/self
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index db310eedc268..344bca8e4b28 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -1,4 +1,5 @@
 CFLAGS += -Wall -O2 -Wno-unused-function
+CFLAGS += -D_GNU_SOURCE
 
 TEST_GEN_PROGS :=
 TEST_GEN_PROGS += fd-001-lookup
@@ -12,5 +13,6 @@ TEST_GEN_PROGS += proc-self-wchan
 TEST_GEN_PROGS += proc-uptime-001
 TEST_GEN_PROGS += proc-uptime-002
 TEST_GEN_PROGS += read
+TEST_GEN_PROGS += self
 
 include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h
index 4e178166fd84..31ca4b0c007f 100644
--- a/tools/testing/selftests/proc/proc.h
+++ b/tools/testing/selftests/proc/proc.h
@@ -6,6 +6,13 @@
 #include <stdbool.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+static inline pid_t sys_getpid(void)
+{
+	return syscall(SYS_getpid);
+}
 
 static inline bool streq(const char *s1, const char *s2)
 {
diff --git a/tools/testing/selftests/proc/self.c b/tools/testing/selftests/proc/self.c
new file mode 100644
index 000000000000..21c15a1ffefb
--- /dev/null
+++ b/tools/testing/selftests/proc/self.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/self gives correct TGID.
+#undef NDEBUG
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+int main(void)
+{
+	char buf1[64], buf2[64];
+	pid_t pid;
+	ssize_t rv;
+
+	pid = sys_getpid();
+	snprintf(buf1, sizeof(buf1), "%u", pid);
+
+	rv = readlink("/proc/self", buf2, sizeof(buf2));
+	assert(rv == strlen(buf1));
+	buf2[rv] = '\0';
+	assert(streq(buf1, buf2));
+
+	return 0;
+}
-- 
cgit v1.2.3


From 2cd36fb329487e9525db3ccdbac1e92053b646fa Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:20 -0700
Subject: proc: test /proc/thread-self symlink

Same story: I have WIP patch to make it faster, so better have a test
as well.

Link: http://lkml.kernel.org/r/20180627195209.GC18113@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/proc/.gitignore    |  1 +
 tools/testing/selftests/proc/Makefile      |  1 +
 tools/testing/selftests/proc/proc.h        |  5 +++
 tools/testing/selftests/proc/thread-self.c | 64 ++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+)
 create mode 100644 tools/testing/selftests/proc/thread-self.c

diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index ee33d6c3137f..82121a81681f 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -10,3 +10,4 @@
 /proc-uptime-002
 /read
 /self
+/thread-self
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 344bca8e4b28..1c12c34cf85d 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -14,5 +14,6 @@ TEST_GEN_PROGS += proc-uptime-001
 TEST_GEN_PROGS += proc-uptime-002
 TEST_GEN_PROGS += read
 TEST_GEN_PROGS += self
+TEST_GEN_PROGS += thread-self
 
 include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h
index 31ca4b0c007f..b7d57ea40237 100644
--- a/tools/testing/selftests/proc/proc.h
+++ b/tools/testing/selftests/proc/proc.h
@@ -14,6 +14,11 @@ static inline pid_t sys_getpid(void)
 	return syscall(SYS_getpid);
 }
 
+static inline pid_t sys_gettid(void)
+{
+	return syscall(SYS_gettid);
+}
+
 static inline bool streq(const char *s1, const char *s2)
 {
 	return strcmp(s1, s2) == 0;
diff --git a/tools/testing/selftests/proc/thread-self.c b/tools/testing/selftests/proc/thread-self.c
new file mode 100644
index 000000000000..4b23b39b7ae0
--- /dev/null
+++ b/tools/testing/selftests/proc/thread-self.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/thread-self gives correct TGID/PID.
+#undef NDEBUG
+#include <assert.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "proc.h"
+
+int f(void *arg)
+{
+	char buf1[64], buf2[64];
+	pid_t pid, tid;
+	ssize_t rv;
+
+	pid = sys_getpid();
+	tid = sys_gettid();
+	snprintf(buf1, sizeof(buf1), "%u/task/%u", pid, tid);
+
+	rv = readlink("/proc/thread-self", buf2, sizeof(buf2));
+	assert(rv == strlen(buf1));
+	buf2[rv] = '\0';
+	assert(streq(buf1, buf2));
+
+	if (arg)
+		exit(0);
+	return 0;
+}
+
+int main(void)
+{
+	const int PAGE_SIZE = sysconf(_SC_PAGESIZE);
+	pid_t pid;
+	void *stack;
+
+	/* main thread */
+	f((void *)0);
+
+	stack = mmap(NULL, 2 * PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	assert(stack != MAP_FAILED);
+	/* side thread */
+	pid = clone(f, stack + PAGE_SIZE, CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, (void *)1);
+	assert(pid > 0);
+	pause();
+
+	return 0;
+}
-- 
cgit v1.2.3


From 8d48b2e044218ba8b3f6b1d34e96a7474b6b0688 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:23 -0700
Subject: proc: smaller readlock section in readdir("/proc")

Readdir context is thread local, so ->pos is thread local,
move it out of readlock.

Link: http://lkml.kernel.org/r/20180627195339.GD18113@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index bb1c1625b158..8ae109429a88 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -286,9 +286,9 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
+	i = ctx->pos - 2;
 	read_lock(&proc_subdir_lock);
 	de = pde_subdir_first(de);
-	i = ctx->pos - 2;
 	for (;;) {
 		if (!de) {
 			read_unlock(&proc_subdir_lock);
@@ -309,8 +309,8 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
 			pde_put(de);
 			return 0;
 		}
-		read_lock(&proc_subdir_lock);
 		ctx->pos++;
+		read_lock(&proc_subdir_lock);
 		next = pde_subdir_next(de);
 		pde_put(de);
 		de = next;
-- 
cgit v1.2.3


From a44937fe4ef6a1190576492017939df636f4e38e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:27 -0700
Subject: proc: put task earlier in /proc/*/fail-nth

Link: http://lkml.kernel.org/r/20180627195427.GE18113@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ad047977ed04..912a0306bb4d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1366,10 +1366,8 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
 	if (!task)
 		return -ESRCH;
 	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
-	len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
 	put_task_struct(task);
-
-	return len;
+	return simple_read_from_buffer(buf, count, ppos, numbuf, len);
 }
 
 static const struct file_operations proc_fail_nth_operations = {
-- 
cgit v1.2.3


From 41089b6d3e44a895076cc8ce56b08e463cb4f796 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:30 -0700
Subject: proc: save 2 atomic ops on write to "/proc/*/attr/*"

Code checks if write is done by current to its own attributes.
For that get/put pair is unnecessary as it can be done under RCU.

Note: rcu_read_unlock() can be done even earlier since pointer to a task
is not dereferenced. It depends if /proc code should look scary or not:

	rcu_read_lock();
	task = pid_task(...);
	rcu_read_unlock();
	if (!task)
		return -ESRCH;
	if (task != current)
		return -EACCESS:

P.S.: rename "length" variable.	Code like this

	length = -EINVAL;

should not exist.

Link: http://lkml.kernel.org/r/20180627200218.GF18113@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 912a0306bb4d..4d8dac635da4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2517,47 +2517,47 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 				   size_t count, loff_t *ppos)
 {
 	struct inode * inode = file_inode(file);
+	struct task_struct *task;
 	void *page;
-	ssize_t length;
-	struct task_struct *task = get_proc_task(inode);
-
-	length = -ESRCH;
-	if (!task)
-		goto out_no_task;
+	int rv;
 
+	rcu_read_lock();
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
 	/* A task may only write its own attributes. */
-	length = -EACCES;
-	if (current != task)
-		goto out;
+	if (current != task) {
+		rcu_read_unlock();
+		return -EACCES;
+	}
+	rcu_read_unlock();
 
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
 
 	/* No partial writes. */
-	length = -EINVAL;
 	if (*ppos != 0)
-		goto out;
+		return -EINVAL;
 
 	page = memdup_user(buf, count);
 	if (IS_ERR(page)) {
-		length = PTR_ERR(page);
+		rv = PTR_ERR(page);
 		goto out;
 	}
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
-	if (length < 0)
+	rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
+	if (rv < 0)
 		goto out_free;
 
-	length = security_setprocattr(file->f_path.dentry->d_name.name,
-				      page, count);
+	rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 out_free:
 	kfree(page);
 out:
-	put_task_struct(task);
-out_no_task:
-	return length;
+	return rv;
 }
 
 static const struct file_operations proc_pid_attr_operations = {
-- 
cgit v1.2.3


From f6d2f584d88616e27eeb603dc8c88ca16e00d682 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:34 -0700
Subject: proc: use macro in /proc/latency hook

->latency_record is defined as

	struct latency_record[LT_SAVECOUNT];

so use the same macro whie iterating.

Link: http://lkml.kernel.org/r/20180627200534.GA18434@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4d8dac635da4..ccf86f16d9f0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -463,7 +463,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 	if (!task)
 		return -ESRCH;
 	seq_puts(m, "Latency Top version : v0.1\n");
-	for (i = 0; i < 32; i++) {
+	for (i = 0; i < LT_SAVECOUNT; i++) {
 		struct latency_record *lr = &task->latency_record[i];
 		if (lr->backtrace[0]) {
 			int q;
-- 
cgit v1.2.3


From 891ae71dc4fbd2454a3fa569e115a7ca86630949 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:37 -0700
Subject: proc: spread "const" a bit

Link: http://lkml.kernel.org/r/20180627200614.GB18434@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h      | 4 ++--
 include/linux/proc_fs.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c9d97818a421..5185d7f6a51e 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -113,12 +113,12 @@ static inline void *__PDE_DATA(const struct inode *inode)
 	return PDE(inode)->data;
 }
 
-static inline struct pid *proc_pid(struct inode *inode)
+static inline struct pid *proc_pid(const struct inode *inode)
 {
 	return PROC_I(inode)->pid;
 }
 
-static inline struct task_struct *get_proc_task(struct inode *inode)
+static inline struct task_struct *get_proc_task(const struct inode *inode)
 {
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 626fc65c4336..d0e1f1522a78 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns));
 
 /* get the associated pid namespace for a file in procfs */
-static inline struct pid_namespace *proc_pid_ns(struct inode *inode)
+static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
 {
 	return inode->i_sb->s_fs_info;
 }
-- 
cgit v1.2.3


From 9a27e97aaab9a25fac2e8976e98e42ab7f4a8fac Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 21 Aug 2018 21:54:41 -0700
Subject: proc: use "unsigned int" in /proc/stat hook

Number of CPUs is never high enough to force 64-bit arithmetic.
Save couple of bytes on x86_64.

Link: http://lkml.kernel.org/r/20180627200710.GC18434@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 59749dfaef67..535eda7857cf 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -183,7 +183,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 static int stat_open(struct inode *inode, struct file *file)
 {
-	size_t size = 1024 + 128 * num_online_cpus();
+	unsigned int size = 1024 + 128 * num_online_cpus();
 
 	/* minimum size to display an interrupt count : 2 bytes */
 	size += 2 * nr_irqs;
-- 
cgit v1.2.3


From 36f062042b0fd9f8e41b97a472f52139886ca26f Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Tue, 21 Aug 2018 21:54:44 -0700
Subject: fs/proc/vmcore.c: use new typedef vm_fault_t

Use new return type vm_fault_t for fault handler in struct
vm_operations_struct.  For now, this is just documenting that the function
returns a VM_FAULT value rather than an errno.  Once all instances are
converted, vm_fault_t will become a distinct type.

See 1c8f422059ae ("mm: change return type to vm_fault_t") for reference.

Link: http://lkml.kernel.org/r/20180702153325.GA3875@jordon-HP-15-Notebook-PC
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Ganesh Goudar <ganeshgr@chelsio.com>
Cc: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cfb6674331fd..6c1c2607e9e4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -379,7 +379,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
  * On s390 the fault handler is used for memory regions that can't be mapped
  * directly with remap_pfn_range().
  */
-static int mmap_vmcore_fault(struct vm_fault *vmf)
+static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
 {
 #ifdef CONFIG_S390
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-- 
cgit v1.2.3


From df865e8337c397471b95f51017fea559bc8abb4a Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Tue, 21 Aug 2018 21:54:48 -0700
Subject: fs/proc/kcore.c: use __pa_symbol() for KCORE_TEXT list entries

elf_kcore_store_hdr() uses __pa() to find the physical address of
KCORE_RAM or KCORE_TEXT entries exported as program headers.

This trips CONFIG_DEBUG_VIRTUAL's checks, as the KCORE_TEXT entries are
not in the linear map.

Handle these two cases separately, using __pa_symbol() for the KCORE_TEXT
entries.

Link: http://lkml.kernel.org/r/20180711131944.15252-1-james.morse@arm.com
Signed-off-by: James Morse <james.morse@arm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Omar Sandoval <osandov@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e64ecb9f2720..66c373230e60 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -384,8 +384,10 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 		phdr->p_flags	= PF_R|PF_W|PF_X;
 		phdr->p_offset	= kc_vaddr_to_offset(m->addr) + dataoff;
 		phdr->p_vaddr	= (size_t)m->addr;
-		if (m->type == KCORE_RAM || m->type == KCORE_TEXT)
+		if (m->type == KCORE_RAM)
 			phdr->p_paddr	= __pa(m->addr);
+		else if (m->type == KCORE_TEXT)
+			phdr->p_paddr	= __pa_symbol(m->addr);
 		else
 			phdr->p_paddr	= (elf_addr_t)-1;
 		phdr->p_filesz	= phdr->p_memsz	= m->size;
-- 
cgit v1.2.3


From a8dd9c4df18edc873d244790d163564a5d17626b Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:54:51 -0700
Subject: proc/kcore: don't grab lock for kclist_add()

Patch series "/proc/kcore improvements", v4.

This series makes a few improvements to /proc/kcore.  It fixes a couple of
small issues in v3 but is otherwise the same.  Patches 1, 2, and 3 are
prep patches.  Patch 4 is a fix/cleanup.  Patch 5 is another prep patch.
Patches 6 and 7 are optimizations to ->read().  Patch 8 makes it possible
to enable CRASH_CORE on any architecture, which is needed for patch 9.
Patch 9 adds vmcoreinfo to /proc/kcore.

This patch (of 9):

kclist_add() is only called at init time, so there's no point in grabbing
any locks.  We're also going to replace the rwlock with a rwsem, which we
don't want to try grabbing during early boot.

While we're here, mark kclist_add() with __init so that we'll get a
warning if it's called from non-init code.

Link: http://lkml.kernel.org/r/98208db1faf167aa8b08eebfa968d95c70527739.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Bhupesh Sharma <bhsharma@redhat.com>
Tested-by: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c       | 7 +++----
 include/linux/kcore.h | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 66c373230e60..b0b9a76f28d6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -62,16 +62,15 @@ static LIST_HEAD(kclist_head);
 static DEFINE_RWLOCK(kclist_lock);
 static int kcore_need_update = 1;
 
-void
-kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+		       int type)
 {
 	new->addr = (unsigned long)addr;
 	new->size = size;
 	new->type = type;
 
-	write_lock(&kclist_lock);
 	list_add_tail(&new->list, &kclist_head);
-	write_unlock(&kclist_lock);
 }
 
 static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8de55e4b5ee9..c20f296438fb 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -35,7 +35,7 @@ struct vmcoredd_node {
 };
 
 #ifdef CONFIG_PROC_KCORE
-extern void kclist_add(struct kcore_list *, void *, size_t, int type);
+void __init kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
 static inline
 void kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
-- 
cgit v1.2.3


From bf53183164dbba00b342df7d2215b33007ed83ed Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:54:55 -0700
Subject: proc/kcore: don't grab lock for memory hotplug notifier

The memory hotplug notifier kcore_callback() only needs kclist_lock to
prevent races with __kcore_update_ram(), but we can easily eliminate that
race by using an atomic xchg() in __kcore_update_ram().  This is
preparation for converting kclist_lock to an rwsem.

Link: http://lkml.kernel.org/r/0a4bc89f4dbde8b5b2ea309f7b4fb6a85fe29df2.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index b0b9a76f28d6..e83f15a4f66d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -118,7 +118,7 @@ static void __kcore_update_ram(struct list_head *list)
 	LIST_HEAD(garbage);
 
 	write_lock(&kclist_lock);
-	if (kcore_need_update) {
+	if (xchg(&kcore_need_update, 0)) {
 		list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
 			if (pos->type == KCORE_RAM
 				|| pos->type == KCORE_VMEMMAP)
@@ -127,7 +127,6 @@ static void __kcore_update_ram(struct list_head *list)
 		list_splice_tail(list, &kclist_head);
 	} else
 		list_splice(list, &garbage);
-	kcore_need_update = 0;
 	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
 	write_unlock(&kclist_lock);
 
@@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block *self,
 	switch (action) {
 	case MEM_ONLINE:
 	case MEM_OFFLINE:
-		write_lock(&kclist_lock);
 		kcore_need_update = 1;
-		write_unlock(&kclist_lock);
+		break;
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.3


From 0b172f845ff963ab15e2d861dc155e2ab13241e9 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:54:59 -0700
Subject: proc/kcore: replace kclist_lock rwlock with rwsem

Now we only need kclist_lock from user context and at fs init time, and
the following changes need to sleep while holding the kclist_lock.

Link: http://lkml.kernel.org/r/521ba449ebe921d905177410fee9222d07882f0d.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e83f15a4f66d..ae43a97d511d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -59,7 +59,7 @@ struct memelfnote
 };
 
 static LIST_HEAD(kclist_head);
-static DEFINE_RWLOCK(kclist_lock);
+static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
@@ -117,7 +117,7 @@ static void __kcore_update_ram(struct list_head *list)
 	struct kcore_list *tmp, *pos;
 	LIST_HEAD(garbage);
 
-	write_lock(&kclist_lock);
+	down_write(&kclist_lock);
 	if (xchg(&kcore_need_update, 0)) {
 		list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
 			if (pos->type == KCORE_RAM
@@ -128,7 +128,7 @@ static void __kcore_update_ram(struct list_head *list)
 	} else
 		list_splice(list, &garbage);
 	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-	write_unlock(&kclist_lock);
+	up_write(&kclist_lock);
 
 	free_kclist_ents(&garbage);
 }
@@ -451,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	int nphdr;
 	unsigned long start;
 
-	read_lock(&kclist_lock);
+	down_read(&kclist_lock);
 	size = get_kcore_size(&nphdr, &elf_buflen);
 
 	if (buflen == 0 || *fpos >= size) {
-		read_unlock(&kclist_lock);
+		up_read(&kclist_lock);
 		return 0;
 	}
 
@@ -472,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			tsz = buflen;
 		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
 		if (!elf_buf) {
-			read_unlock(&kclist_lock);
+			up_read(&kclist_lock);
 			return -ENOMEM;
 		}
 		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-		read_unlock(&kclist_lock);
+		up_read(&kclist_lock);
 		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
 			kfree(elf_buf);
 			return -EFAULT;
@@ -491,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		if (buflen == 0)
 			return acc;
 	} else
-		read_unlock(&kclist_lock);
+		up_read(&kclist_lock);
 
 	/*
 	 * Check to see if our file offset matches with any of
@@ -504,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	while (buflen) {
 		struct kcore_list *m;
 
-		read_lock(&kclist_lock);
+		down_read(&kclist_lock);
 		list_for_each_entry(m, &kclist_head, list) {
 			if (start >= m->addr && start < (m->addr+m->size))
 				break;
 		}
-		read_unlock(&kclist_lock);
+		up_read(&kclist_lock);
 
 		if (&m->list == &kclist_head) {
 			if (clear_user(buffer, tsz))
-- 
cgit v1.2.3


From b66fb005c97544e9e589b2f2e60ccfe3808c6c3e Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:02 -0700
Subject: proc/kcore: fix memory hotplug vs multiple opens race

There's a theoretical race condition that will cause /proc/kcore to miss
a memory hotplug event:

CPU0                              CPU1
// hotplug event 1
kcore_need_update = 1

open_kcore()                      open_kcore()
    kcore_update_ram()                kcore_update_ram()
        // Walk RAM                       // Walk RAM
        __kcore_update_ram()              __kcore_update_ram()
            kcore_need_update = 0

// hotplug event 2
kcore_need_update = 1
                                              kcore_need_update = 0

Note that CPU1 set up the RAM kcore entries with the state after hotplug
event 1 but cleared the flag for hotplug event 2.  The RAM entries will
therefore be stale until there is another hotplug event.

This is an extremely unlikely sequence of events, but the fix makes the
synchronization saner, anyways: we serialize the entire update sequence,
which means that whoever clears the flag will always succeed in replacing
the kcore list.

Link: http://lkml.kernel.org/r/6106c509998779730c12400c1b996425df7d7089.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 93 +++++++++++++++++++++++++++------------------------------
 1 file changed, 44 insertions(+), 49 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ae43a97d511d..95aa988c5b5d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 	return size + *elf_buflen;
 }
 
-static void free_kclist_ents(struct list_head *head)
-{
-	struct kcore_list *tmp, *pos;
-
-	list_for_each_entry_safe(pos, tmp, head, list) {
-		list_del(&pos->list);
-		kfree(pos);
-	}
-}
-/*
- * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
- */
-static void __kcore_update_ram(struct list_head *list)
-{
-	int nphdr;
-	size_t size;
-	struct kcore_list *tmp, *pos;
-	LIST_HEAD(garbage);
-
-	down_write(&kclist_lock);
-	if (xchg(&kcore_need_update, 0)) {
-		list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
-			if (pos->type == KCORE_RAM
-				|| pos->type == KCORE_VMEMMAP)
-				list_move(&pos->list, &garbage);
-		}
-		list_splice_tail(list, &kclist_head);
-	} else
-		list_splice(list, &garbage);
-	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
-	up_write(&kclist_lock);
-
-	free_kclist_ents(&garbage);
-}
-
-
 #ifdef CONFIG_HIGHMEM
 /*
  * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
  * because memory hole is not as big as !HIGHMEM case.
  * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
  */
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *head)
 {
-	LIST_HEAD(head);
 	struct kcore_list *ent;
-	int ret = 0;
 
 	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
 	if (!ent)
@@ -152,9 +114,8 @@ static int kcore_update_ram(void)
 	ent->addr = (unsigned long)__va(0);
 	ent->size = max_low_pfn << PAGE_SHIFT;
 	ent->type = KCORE_RAM;
-	list_add(&ent->list, &head);
-	__kcore_update_ram(&head);
-	return ret;
+	list_add(&ent->list, head);
+	return 0;
 }
 
 #else /* !CONFIG_HIGHMEM */
@@ -253,11 +214,10 @@ free_out:
 	return 1;
 }
 
-static int kcore_update_ram(void)
+static int kcore_ram_list(struct list_head *list)
 {
 	int nid, ret;
 	unsigned long end_pfn;
-	LIST_HEAD(head);
 
 	/* Not inialized....update now */
 	/* find out "max pfn" */
@@ -269,15 +229,50 @@ static int kcore_update_ram(void)
 			end_pfn = node_end;
 	}
 	/* scan 0 to max_pfn */
-	ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
-	if (ret) {
-		free_kclist_ents(&head);
+	ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+	if (ret)
 		return -ENOMEM;
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+	LIST_HEAD(list);
+	LIST_HEAD(garbage);
+	int nphdr;
+	size_t size;
+	struct kcore_list *tmp, *pos;
+	int ret = 0;
+
+	down_write(&kclist_lock);
+	if (!xchg(&kcore_need_update, 0))
+		goto out;
+
+	ret = kcore_ram_list(&list);
+	if (ret) {
+		/* Couldn't get the RAM list, try again next time. */
+		WRITE_ONCE(kcore_need_update, 1);
+		list_splice_tail(&list, &garbage);
+		goto out;
+	}
+
+	list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+		if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+			list_move(&pos->list, &garbage);
+	}
+	list_splice_tail(&list, &kclist_head);
+
+	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+
+out:
+	up_write(&kclist_lock);
+	list_for_each_entry_safe(pos, tmp, &garbage, list) {
+		list_del(&pos->list);
+		kfree(pos);
 	}
-	__kcore_update_ram(&head);
 	return ret;
 }
-#endif /* CONFIG_HIGHMEM */
 
 /*****************************************************************************/
 /*
-- 
cgit v1.2.3


From 3673fb08db73347332ab956d90090c1da6e610b7 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:06 -0700
Subject: proc/kcore: hold lock during read

Now that we're using an rwsem, we can hold it during the entirety of
read_kcore() and have a common return path.  This is preparation for the
next change.

[akpm@linux-foundation.org: fix locking bug reported by Tetsuo Handa]
Link: http://lkml.kernel.org/r/d7cfbc1e8a76616f3b699eaff9df0a2730380534.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 70 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 95aa988c5b5d..dc34642bbdb7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -440,19 +440,18 @@ static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
 	char *buf = file->private_data;
-	ssize_t acc = 0;
 	size_t size, tsz;
 	size_t elf_buflen;
 	int nphdr;
 	unsigned long start;
+	size_t orig_buflen = buflen;
+	int ret = 0;
 
 	down_read(&kclist_lock);
 	size = get_kcore_size(&nphdr, &elf_buflen);
 
-	if (buflen == 0 || *fpos >= size) {
-		up_read(&kclist_lock);
-		return 0;
-	}
+	if (buflen == 0 || *fpos >= size)
+		goto out;
 
 	/* trim buflen to not go beyond EOF */
 	if (buflen > size - *fpos)
@@ -465,28 +464,26 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		tsz = elf_buflen - *fpos;
 		if (buflen < tsz)
 			tsz = buflen;
-		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
+		elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
 		if (!elf_buf) {
-			up_read(&kclist_lock);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-		up_read(&kclist_lock);
 		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
 			kfree(elf_buf);
-			return -EFAULT;
+			ret = -EFAULT;
+			goto out;
 		}
 		kfree(elf_buf);
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
-		acc += tsz;
 
 		/* leave now if filled buffer already */
 		if (buflen == 0)
-			return acc;
-	} else
-		up_read(&kclist_lock);
+			goto out;
+	}
 
 	/*
 	 * Check to see if our file offset matches with any of
@@ -499,25 +496,29 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	while (buflen) {
 		struct kcore_list *m;
 
-		down_read(&kclist_lock);
 		list_for_each_entry(m, &kclist_head, list) {
 			if (start >= m->addr && start < (m->addr+m->size))
 				break;
 		}
-		up_read(&kclist_lock);
 
 		if (&m->list == &kclist_head) {
-			if (clear_user(buffer, tsz))
-				return -EFAULT;
+			if (clear_user(buffer, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_VMALLOC) {
 			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_user(buffer, buf, tsz))
-				return -EFAULT;
+			if (copy_to_user(buffer, buf, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_USER) {
 			/* User page is handled prior to normal kernel page: */
-			if (copy_to_user(buffer, (char *)start, tsz))
-				return -EFAULT;
+			if (copy_to_user(buffer, (char *)start, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else {
 			if (kern_addr_valid(start)) {
 				/*
@@ -525,26 +526,35 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				 * hardened user copy kernel text checks.
 				 */
 				if (probe_kernel_read(buf, (void *) start, tsz)) {
-					if (clear_user(buffer, tsz))
-						return -EFAULT;
+					if (clear_user(buffer, tsz)) {
+						ret = -EFAULT;
+						goto out;
+					}
 				} else {
-					if (copy_to_user(buffer, buf, tsz))
-						return -EFAULT;
+					if (copy_to_user(buffer, buf, tsz)) {
+						ret = -EFAULT;
+						goto out;
+					}
 				}
 			} else {
-				if (clear_user(buffer, tsz))
-					return -EFAULT;
+				if (clear_user(buffer, tsz)) {
+					ret = -EFAULT;
+					goto out;
+				}
 			}
 		}
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
-		acc += tsz;
 		start += tsz;
 		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
 	}
 
-	return acc;
+out:
+	up_read(&kclist_lock);
+	if (ret)
+		return ret;
+	return orig_buflen - buflen;
 }
 
 
-- 
cgit v1.2.3


From 37e949bd5293ddb70acf236eedf2ae8caa1db57b Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:09 -0700
Subject: proc/kcore: clean up ELF header generation

Currently, the ELF file header, program headers, and note segment are
allocated all at once, in some icky code dating back to 2.3.  Programs
tend to read the file header, then the program headers, then the note
segment, all separately, so this is a waste of effort.  It's cleaner and
more efficient to handle the three separately.

Link: http://lkml.kernel.org/r/19c92cbad0e11f6103ff3274b2e7a7e51a1eb74b.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 350 +++++++++++++++++++++++---------------------------------
 1 file changed, 141 insertions(+), 209 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dc34642bbdb7..808ef9afd084 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -49,15 +49,6 @@ static struct proc_dir_entry *proc_root_kcore;
 #define	kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
 #endif
 
-/* An ELF note in memory */
-struct memelfnote
-{
-	const char *name;
-	int type;
-	unsigned int datasz;
-	void *data;
-};
-
 static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
@@ -73,7 +64,8 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
 	list_add_tail(&new->list, &kclist_head);
 }
 
-static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+			     size_t *data_offset)
 {
 	size_t try, size;
 	struct kcore_list *m;
@@ -87,15 +79,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			size = try;
 		*nphdr = *nphdr + 1;
 	}
-	*elf_buflen =	sizeof(struct elfhdr) + 
-			(*nphdr + 2)*sizeof(struct elf_phdr) + 
-			3 * ((sizeof(struct elf_note)) +
-			     roundup(sizeof(CORE_STR), 4)) +
-			roundup(sizeof(struct elf_prstatus), 4) +
-			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(arch_task_struct_size, 4);
-	*elf_buflen = PAGE_ALIGN(*elf_buflen);
-	return size + *elf_buflen;
+
+	*phdrs_len = *nphdr * sizeof(struct elf_phdr);
+	*notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) +
+		      ALIGN(sizeof(struct elf_prstatus), 4) +
+		      ALIGN(sizeof(struct elf_prpsinfo), 4) +
+		      ALIGN(arch_task_struct_size, 4));
+	*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+				  *notes_len);
+	return *data_offset + size;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -241,7 +233,7 @@ static int kcore_update_ram(void)
 	LIST_HEAD(list);
 	LIST_HEAD(garbage);
 	int nphdr;
-	size_t size;
+	size_t phdrs_len, notes_len, data_offset;
 	struct kcore_list *tmp, *pos;
 	int ret = 0;
 
@@ -263,7 +255,8 @@ static int kcore_update_ram(void)
 	}
 	list_splice_tail(&list, &kclist_head);
 
-	proc_root_kcore->size = get_kcore_size(&nphdr, &size);
+	proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+					       &data_offset);
 
 out:
 	up_write(&kclist_lock);
@@ -274,228 +267,168 @@ out:
 	return ret;
 }
 
-/*****************************************************************************/
-/*
- * determine size of ELF note
- */
-static int notesize(struct memelfnote *en)
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+			      unsigned int type, const void *desc,
+			      size_t descsz)
 {
-	int sz;
-
-	sz = sizeof(struct elf_note);
-	sz += roundup((strlen(en->name) + 1), 4);
-	sz += roundup(en->datasz, 4);
-
-	return sz;
-} /* end notesize() */
-
-/*****************************************************************************/
-/*
- * store a note in the header buffer
- */
-static char *storenote(struct memelfnote *men, char *bufp)
-{
-	struct elf_note en;
-
-#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
-
-	en.n_namesz = strlen(men->name) + 1;
-	en.n_descsz = men->datasz;
-	en.n_type = men->type;
-
-	DUMP_WRITE(&en, sizeof(en));
-	DUMP_WRITE(men->name, en.n_namesz);
-
-	/* XXX - cast from long long to long to avoid need for libgcc.a */
-	bufp = (char*) roundup((unsigned long)bufp,4);
-	DUMP_WRITE(men->data, men->datasz);
-	bufp = (char*) roundup((unsigned long)bufp,4);
-
-#undef DUMP_WRITE
-
-	return bufp;
-} /* end storenote() */
-
-/*
- * store an ELF coredump header in the supplied buffer
- * nphdr is the number of elf_phdr to insert
- */
-static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
-{
-	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
-	struct elf_prpsinfo prpsinfo;	/* NT_PRPSINFO */
-	struct elf_phdr *nhdr, *phdr;
-	struct elfhdr *elf;
-	struct memelfnote notes[3];
-	off_t offset = 0;
-	struct kcore_list *m;
-
-	/* setup ELF header */
-	elf = (struct elfhdr *) bufp;
-	bufp += sizeof(struct elfhdr);
-	offset += sizeof(struct elfhdr);
-	memcpy(elf->e_ident, ELFMAG, SELFMAG);
-	elf->e_ident[EI_CLASS]	= ELF_CLASS;
-	elf->e_ident[EI_DATA]	= ELF_DATA;
-	elf->e_ident[EI_VERSION]= EV_CURRENT;
-	elf->e_ident[EI_OSABI] = ELF_OSABI;
-	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
-	elf->e_type	= ET_CORE;
-	elf->e_machine	= ELF_ARCH;
-	elf->e_version	= EV_CURRENT;
-	elf->e_entry	= 0;
-	elf->e_phoff	= sizeof(struct elfhdr);
-	elf->e_shoff	= 0;
-	elf->e_flags	= ELF_CORE_EFLAGS;
-	elf->e_ehsize	= sizeof(struct elfhdr);
-	elf->e_phentsize= sizeof(struct elf_phdr);
-	elf->e_phnum	= nphdr;
-	elf->e_shentsize= 0;
-	elf->e_shnum	= 0;
-	elf->e_shstrndx	= 0;
-
-	/* setup ELF PT_NOTE program header */
-	nhdr = (struct elf_phdr *) bufp;
-	bufp += sizeof(struct elf_phdr);
-	offset += sizeof(struct elf_phdr);
-	nhdr->p_type	= PT_NOTE;
-	nhdr->p_offset	= 0;
-	nhdr->p_vaddr	= 0;
-	nhdr->p_paddr	= 0;
-	nhdr->p_filesz	= 0;
-	nhdr->p_memsz	= 0;
-	nhdr->p_flags	= 0;
-	nhdr->p_align	= 0;
-
-	/* setup ELF PT_LOAD program header for every area */
-	list_for_each_entry(m, &kclist_head, list) {
-		phdr = (struct elf_phdr *) bufp;
-		bufp += sizeof(struct elf_phdr);
-		offset += sizeof(struct elf_phdr);
-
-		phdr->p_type	= PT_LOAD;
-		phdr->p_flags	= PF_R|PF_W|PF_X;
-		phdr->p_offset	= kc_vaddr_to_offset(m->addr) + dataoff;
-		phdr->p_vaddr	= (size_t)m->addr;
-		if (m->type == KCORE_RAM)
-			phdr->p_paddr	= __pa(m->addr);
-		else if (m->type == KCORE_TEXT)
-			phdr->p_paddr	= __pa_symbol(m->addr);
-		else
-			phdr->p_paddr	= (elf_addr_t)-1;
-		phdr->p_filesz	= phdr->p_memsz	= m->size;
-		phdr->p_align	= PAGE_SIZE;
-	}
-
-	/*
-	 * Set up the notes in similar form to SVR4 core dumps made
-	 * with info from their /proc.
-	 */
-	nhdr->p_offset	= offset;
-
-	/* set up the process status */
-	notes[0].name = CORE_STR;
-	notes[0].type = NT_PRSTATUS;
-	notes[0].datasz = sizeof(struct elf_prstatus);
-	notes[0].data = &prstatus;
-
-	memset(&prstatus, 0, sizeof(struct elf_prstatus));
-
-	nhdr->p_filesz	= notesize(&notes[0]);
-	bufp = storenote(&notes[0], bufp);
-
-	/* set up the process info */
-	notes[1].name	= CORE_STR;
-	notes[1].type	= NT_PRPSINFO;
-	notes[1].datasz	= sizeof(struct elf_prpsinfo);
-	notes[1].data	= &prpsinfo;
-
-	memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo));
-	prpsinfo.pr_state	= 0;
-	prpsinfo.pr_sname	= 'R';
-	prpsinfo.pr_zomb	= 0;
-
-	strcpy(prpsinfo.pr_fname, "vmlinux");
-	strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
-
-	nhdr->p_filesz	+= notesize(&notes[1]);
-	bufp = storenote(&notes[1], bufp);
-
-	/* set up the task structure */
-	notes[2].name	= CORE_STR;
-	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= arch_task_struct_size;
-	notes[2].data	= current;
-
-	nhdr->p_filesz	+= notesize(&notes[2]);
-	bufp = storenote(&notes[2], bufp);
-
-} /* end elf_kcore_store_hdr() */
+	struct elf_note *note = (struct elf_note *)&notes[*i];
+
+	note->n_namesz = strlen(name) + 1;
+	note->n_descsz = descsz;
+	note->n_type = type;
+	*i += sizeof(*note);
+	memcpy(&notes[*i], name, note->n_namesz);
+	*i = ALIGN(*i + note->n_namesz, 4);
+	memcpy(&notes[*i], desc, descsz);
+	*i = ALIGN(*i + descsz, 4);
+}
 
-/*****************************************************************************/
-/*
- * read from the ELF header and then kernel memory
- */
 static ssize_t
 read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 {
 	char *buf = file->private_data;
-	size_t size, tsz;
-	size_t elf_buflen;
+	size_t phdrs_offset, notes_offset, data_offset;
+	size_t phdrs_len, notes_len;
+	struct kcore_list *m;
+	size_t tsz;
 	int nphdr;
 	unsigned long start;
 	size_t orig_buflen = buflen;
 	int ret = 0;
 
 	down_read(&kclist_lock);
-	size = get_kcore_size(&nphdr, &elf_buflen);
 
-	if (buflen == 0 || *fpos >= size)
-		goto out;
+	get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
+	phdrs_offset = sizeof(struct elfhdr);
+	notes_offset = phdrs_offset + phdrs_len;
+
+	/* ELF file header. */
+	if (buflen && *fpos < sizeof(struct elfhdr)) {
+		struct elfhdr ehdr = {
+			.e_ident = {
+				[EI_MAG0] = ELFMAG0,
+				[EI_MAG1] = ELFMAG1,
+				[EI_MAG2] = ELFMAG2,
+				[EI_MAG3] = ELFMAG3,
+				[EI_CLASS] = ELF_CLASS,
+				[EI_DATA] = ELF_DATA,
+				[EI_VERSION] = EV_CURRENT,
+				[EI_OSABI] = ELF_OSABI,
+			},
+			.e_type = ET_CORE,
+			.e_machine = ELF_ARCH,
+			.e_version = EV_CURRENT,
+			.e_phoff = sizeof(struct elfhdr),
+			.e_flags = ELF_CORE_EFLAGS,
+			.e_ehsize = sizeof(struct elfhdr),
+			.e_phentsize = sizeof(struct elf_phdr),
+			.e_phnum = nphdr,
+		};
+
+		tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
+		if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+			ret = -EFAULT;
+			goto out;
+		}
 
-	/* trim buflen to not go beyond EOF */
-	if (buflen > size - *fpos)
-		buflen = size - *fpos;
+		buffer += tsz;
+		buflen -= tsz;
+		*fpos += tsz;
+	}
 
-	/* construct an ELF core header if we'll need some of it */
-	if (*fpos < elf_buflen) {
-		char * elf_buf;
+	/* ELF program headers. */
+	if (buflen && *fpos < phdrs_offset + phdrs_len) {
+		struct elf_phdr *phdrs, *phdr;
 
-		tsz = elf_buflen - *fpos;
-		if (buflen < tsz)
-			tsz = buflen;
-		elf_buf = kzalloc(elf_buflen, GFP_KERNEL);
-		if (!elf_buf) {
+		phdrs = kzalloc(phdrs_len, GFP_KERNEL);
+		if (!phdrs) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
-		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
-			kfree(elf_buf);
+
+		phdrs[0].p_type = PT_NOTE;
+		phdrs[0].p_offset = notes_offset;
+		phdrs[0].p_filesz = notes_len;
+
+		phdr = &phdrs[1];
+		list_for_each_entry(m, &kclist_head, list) {
+			phdr->p_type = PT_LOAD;
+			phdr->p_flags = PF_R | PF_W | PF_X;
+			phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
+			phdr->p_vaddr = (size_t)m->addr;
+			if (m->type == KCORE_RAM)
+				phdr->p_paddr = __pa(m->addr);
+			else if (m->type == KCORE_TEXT)
+				phdr->p_paddr = __pa_symbol(m->addr);
+			else
+				phdr->p_paddr = (elf_addr_t)-1;
+			phdr->p_filesz = phdr->p_memsz = m->size;
+			phdr->p_align = PAGE_SIZE;
+			phdr++;
+		}
+
+		tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
+		if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
+				 tsz)) {
+			kfree(phdrs);
 			ret = -EFAULT;
 			goto out;
 		}
-		kfree(elf_buf);
+		kfree(phdrs);
+
+		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
-		buffer += tsz;
+	}
+
+	/* ELF note segment. */
+	if (buflen && *fpos < notes_offset + notes_len) {
+		struct elf_prstatus prstatus = {};
+		struct elf_prpsinfo prpsinfo = {
+			.pr_sname = 'R',
+			.pr_fname = "vmlinux",
+		};
+		char *notes;
+		size_t i = 0;
+
+		strlcpy(prpsinfo.pr_psargs, saved_command_line,
+			sizeof(prpsinfo.pr_psargs));
+
+		notes = kzalloc(notes_len, GFP_KERNEL);
+		if (!notes) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+				  sizeof(prstatus));
+		append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+				  sizeof(prpsinfo));
+		append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+				  arch_task_struct_size);
 
-		/* leave now if filled buffer already */
-		if (buflen == 0)
+		tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
+		if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+			kfree(notes);
+			ret = -EFAULT;
 			goto out;
+		}
+		kfree(notes);
+
+		buffer += tsz;
+		buflen -= tsz;
+		*fpos += tsz;
 	}
 
 	/*
 	 * Check to see if our file offset matches with any of
 	 * the addresses in the elf_phdr on our list.
 	 */
-	start = kc_offset_to_vaddr(*fpos - elf_buflen);
+	start = kc_offset_to_vaddr(*fpos - data_offset);
 	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
 		tsz = buflen;
-		
-	while (buflen) {
-		struct kcore_list *m;
 
+	while (buflen) {
 		list_for_each_entry(m, &kclist_head, list) {
 			if (start >= m->addr && start < (m->addr+m->size))
 				break;
@@ -557,7 +490,6 @@ out:
 	return orig_buflen - buflen;
 }
 
-
 static int open_kcore(struct inode *inode, struct file *filp)
 {
 	if (!capable(CAP_SYS_RAWIO))
-- 
cgit v1.2.3


From bf991c2231117d50a7645792b514354fc8d19dae Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:13 -0700
Subject: proc/kcore: optimize multiple page reads

The current code does a full search of the segment list every time for
every page.  This is wasteful, since it's almost certain that the next
page will be in the same segment.  Instead, check if the previous segment
covers the current page before doing the list search.

Link: http://lkml.kernel.org/r/fd346c11090cf93d867e01b8d73a6567c5ac6361.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 808ef9afd084..758c14e46a44 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -428,10 +428,18 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
 		tsz = buflen;
 
+	m = NULL;
 	while (buflen) {
-		list_for_each_entry(m, &kclist_head, list) {
-			if (start >= m->addr && start < (m->addr+m->size))
-				break;
+		/*
+		 * If this is the first iteration or the address is not within
+		 * the previous entry, search for a matching entry.
+		 */
+		if (!m || start < m->addr || start >= m->addr + m->size) {
+			list_for_each_entry(m, &kclist_head, list) {
+				if (start >= m->addr &&
+				    start < m->addr + m->size)
+					break;
+			}
 		}
 
 		if (&m->list == &kclist_head) {
-- 
cgit v1.2.3


From eff4345e7fba0aac8665f00e8599b36946d9aaec Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:17 -0700
Subject: crash_core: use VMCOREINFO_SYMBOL_ARRAY() for swapper_pg_dir

This is preparation for allowing CRASH_CORE to be enabled for any
architecture.

swapper_pg_dir is always either an array or a macro expanding to NULL.
In the latter case, VMCOREINFO_SYMBOL() won't work, as it tries to take
the address of the given symbol:

	#define VMCOREINFO_SYMBOL(name) \
		vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)

Instead, use VMCOREINFO_SYMBOL_ARRAY(), which uses the value:

	#define VMCOREINFO_SYMBOL_ARRAY(name) \
		vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)

This is the same thing for the array case but isn't an error for the macro
case.

Link: http://lkml.kernel.org/r/c05f9781ec204f40fc96f95086e7b6de6a3eb2c3.1532563124.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/crash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b66aced5e8c2..e7b4025c7b24 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(init_uts_ns);
 	VMCOREINFO_SYMBOL(node_online_map);
 #ifdef CONFIG_MMU
-	VMCOREINFO_SYMBOL(swapper_pg_dir);
+	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
 	VMCOREINFO_SYMBOL(_stext);
 	VMCOREINFO_SYMBOL(vmap_area_list);
-- 
cgit v1.2.3


From 23c85094fe1895caefdd19ef624ee687ec5f4507 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Aug 2018 21:55:20 -0700
Subject: proc/kcore: add vmcoreinfo note to /proc/kcore

The vmcoreinfo information is useful for runtime debugging tools, not just
for crash dumps.  A lot of this information can be determined by other
means, but this is much more convenient, and it only adds a page at most
to the file.

Link: http://lkml.kernel.org/r/fddbcd08eed76344863303878b12de1c1e2a04b6.1531953780.git.osandov@fb.com
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: James Morse <james.morse@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/Kconfig            |  1 +
 fs/proc/kcore.c            | 18 ++++++++++++++++--
 include/linux/crash_core.h |  2 ++
 kernel/crash_core.c        |  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 0eaeb41453f5..817c02b13b1d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,7 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
+	select CRASH_CORE
 	help
 	  Provides a virtual ELF core file of the live kernel.  This can
 	  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 758c14e46a44..80464432dfe6 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,6 +10,7 @@
  *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
  */
 
+#include <linux/crash_core.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/kcore.h>
@@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
 	}
 
 	*phdrs_len = *nphdr * sizeof(struct elf_phdr);
-	*notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) +
+	*notes_len = (4 * sizeof(struct elf_note) +
+		      3 * ALIGN(sizeof(CORE_STR), 4) +
+		      VMCOREINFO_NOTE_NAME_BYTES +
 		      ALIGN(sizeof(struct elf_prstatus), 4) +
 		      ALIGN(sizeof(struct elf_prpsinfo), 4) +
-		      ALIGN(arch_task_struct_size, 4));
+		      ALIGN(arch_task_struct_size, 4) +
+		      ALIGN(vmcoreinfo_size, 4));
 	*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
 				  *notes_len);
 	return *data_offset + size;
@@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				  sizeof(prpsinfo));
 		append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
 				  arch_task_struct_size);
+		/*
+		 * vmcoreinfo_size is mostly constant after init time, but it
+		 * can be changed by crash_save_vmcoreinfo(). Racing here with a
+		 * panic on another CPU before the machine goes down is insanely
+		 * unlikely, but it's better to not leave potential buffer
+		 * overflows lying around, regardless.
+		 */
+		append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+				  vmcoreinfo_data,
+				  min(vmcoreinfo_size, notes_len - i));
 
 		tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
 		if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..525510a9f965 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
 	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
 extern u32 *vmcoreinfo_note;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index e7b4025c7b24..a683bfb0530f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,8 +14,8 @@
 #include <asm/sections.h>
 
 /* vmcoreinfo stuff */
-static unsigned char *vmcoreinfo_data;
-static size_t vmcoreinfo_size;
+unsigned char *vmcoreinfo_data;
+size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
-- 
cgit v1.2.3


From 96c6a32ccb55a366054fd82cc63523bb7f7493d3 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Tue, 21 Aug 2018 21:55:24 -0700
Subject: include/asm-generic/bug.h: clarify valid uses of WARN()

Explicitly state that WARN*() should be used only for recoverable kernel
issues/bugs and that it should not be used for any kind of invalid
external inputs or transient conditions.

Motivation: it's a very useful capability to be able to understand if a
particular kernel splat means a kernel bug or simply an invalid user-space
program.  For the former one wants to notify kernel developers, while
notifying kernel developers for the latter is annoying.  Even a kernel
developer may not know what to do with a WARNING in an unfamiliar
subsystem.  This is especially critical for any automated testing systems
that may use panic_on_warn and mail kernel developers.

The clear separation also serves as an additional documentation: is it a
condition that must never occur because of additional checks/logic
elsewhere?  or is it simply a check for invalid inputs or unfortunate
conditions?

Use of pr_err() for user messages also leads to better error messages.
"Something is wrong in file foo on line X" is not particularly useful
message for end user.  pr_err() forces developers to write more meaningful
error messages for user.

As of now we are almost there.  We are doing systematic kernel testing
with panic_on_warn and are not seeing massive amounts of false positives.
But every now and then another WARN on ENOMEM or invalid inputs pops up
and leads to a lengthy argument each time.  The goal of this change is to
officially document the rules.

Link: http://lkml.kernel.org/r/20180620103716.61636-1-dvyukov@gmail.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bug.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index a7613e1b0c87..20561a60db9c 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -75,9 +75,19 @@ struct bug_entry {
 
 /*
  * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
- * significant issues that need prompt attention if they should ever
- * appear at runtime.  Use the versions with printk format strings
- * to provide better diagnostics.
+ * significant kernel issues that need prompt attention if they should ever
+ * appear at runtime.
+ *
+ * Do not use these macros when checking for invalid external inputs
+ * (e.g. invalid system call arguments, or invalid data coming from
+ * network/devices), and on transient conditions like ENOMEM or EAGAIN.
+ * These macros should be used for recoverable kernel issues only.
+ * For invalid external inputs, transient conditions, etc use
+ * pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary.
+ * Do not include "BUG"/"WARNING" in format strings manually to make these
+ * conditions distinguishable from kernel issues.
+ *
+ * Use the versions with printk format strings to provide better diagnostics.
  */
 #ifndef __WARN_TAINT
 extern __printf(3, 4)
-- 
cgit v1.2.3


From cedc5b6aab493f6b1b1d381dccc0cc082da7d3d8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 21 Aug 2018 21:55:28 -0700
Subject: kernel.h: documentation for roundup() vs round_up()

Things like 3619dec5103d ("dh key: fix rounding up KDF output length")
expose the lack of explicit documentation for roundup() vs round_up().  At
least we can try to document it better if anyone goes looking.

Link: http://lkml.kernel.org/r/20180703041950.GA43464@beast
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 941dc0a5a877..d6aac75b51ba 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -85,7 +85,23 @@
  * arguments just once each.
  */
 #define __round_mask(x, y) ((__typeof__(x))((y)-1))
+/**
+ * round_up - round up to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round up to (must be a power of 2)
+ *
+ * Rounds @x up to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding up, use roundup() below.
+ */
 #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+/**
+ * round_down - round down to next specified power of 2
+ * @x: the value to round
+ * @y: multiple to round down to (must be a power of 2)
+ *
+ * Rounds @x down to next multiple of @y (which must be a power of 2).
+ * To perform arbitrary rounding down, use rounddown() below.
+ */
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
 /**
@@ -110,13 +126,30 @@
 # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d)
 #endif
 
-/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */
+/**
+ * roundup - round up to the next specified multiple
+ * @x: the value to up
+ * @y: multiple to round up to
+ *
+ * Rounds @x up to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_up().
+ *
+ * The `const' here prevents gcc-3.3 from calling __divdi3
+ */
 #define roundup(x, y) (					\
 {							\
 	const typeof(y) __y = y;			\
 	(((x) + (__y - 1)) / __y) * __y;		\
 }							\
 )
+/**
+ * rounddown - round down to next specified multiple
+ * @x: the value to round
+ * @y: multiple to round down to
+ *
+ * Rounds @x down to next multiple of @y. If @y will always be a power
+ * of 2, consider using the faster round_down().
+ */
 #define rounddown(x, y) (				\
 {							\
 	typeof(x) __x = (x);				\
-- 
cgit v1.2.3


From e58dd0de5eadf145895b13451a1fef8ef03946eb Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 21 Aug 2018 21:55:31 -0700
Subject: bdi: use refcount_t for reference counting instead atomic_t

refcount_t type and corresponding API should be used instead of atomic_t
when the variable is used as a reference counter.  This permits avoiding
accidental refcounter overflows that might lead to use-after-free
situations.

Link: http://lkml.kernel.org/r/20180703200141.28415-4-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev-defs.h |  3 ++-
 include/linux/backing-dev.h      |  4 ++--
 mm/backing-dev.c                 | 12 ++++++------
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 24251762c20c..9a6bc0951cfa 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -12,6 +12,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/kref.h>
+#include <linux/refcount.h>
 
 struct page;
 struct device;
@@ -75,7 +76,7 @@ enum wb_reason {
  */
 struct bdi_writeback_congested {
 	unsigned long state;		/* WB_[a]sync_congested flags */
-	atomic_t refcnt;		/* nr of attached wb's and blkg */
+	refcount_t refcnt;		/* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct backing_dev_info *__bdi;	/* the associated bdi, set to NULL
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 72ca0f3d39f3..c28a47cbe355 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 static inline struct bdi_writeback_congested *
 wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 {
-	atomic_inc(&bdi->wb_congested->refcnt);
+	refcount_inc(&bdi->wb_congested->refcnt);
 	return bdi->wb_congested;
 }
 
 static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
-	if (atomic_dec_and_test(&congested->refcnt))
+	if (refcount_dec_and_test(&congested->refcnt))
 		kfree(congested);
 }
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2e5d3df0853d..55a233d75f39 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -438,10 +438,10 @@ retry:
 	if (new_congested) {
 		/* !found and storage for new one already allocated, insert */
 		congested = new_congested;
-		new_congested = NULL;
 		rb_link_node(&congested->rb_node, parent, node);
 		rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
-		goto found;
+		spin_unlock_irqrestore(&cgwb_lock, flags);
+		return congested;
 	}
 
 	spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -451,13 +451,13 @@ retry:
 	if (!new_congested)
 		return NULL;
 
-	atomic_set(&new_congested->refcnt, 0);
+	refcount_set(&new_congested->refcnt, 1);
 	new_congested->__bdi = bdi;
 	new_congested->blkcg_id = blkcg_id;
 	goto retry;
 
 found:
-	atomic_inc(&congested->refcnt);
+	refcount_inc(&congested->refcnt);
 	spin_unlock_irqrestore(&cgwb_lock, flags);
 	kfree(new_congested);
 	return congested;
@@ -474,7 +474,7 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+	if (!refcount_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
 		local_irq_restore(flags);
 		return;
 	}
@@ -804,7 +804,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 	if (!bdi->wb_congested)
 		return -ENOMEM;
 
-	atomic_set(&bdi->wb_congested->refcnt, 1);
+	refcount_set(&bdi->wb_congested->refcnt, 1);
 
 	err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (err) {
-- 
cgit v1.2.3


From 060288a7320b2837a8ff2af1b3643bdbd5e568f6 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Tue, 21 Aug 2018 21:55:35 -0700
Subject: bdi: use irqsave variant of refcount_dec_and_lock()

The irqsave variant of refcount_dec_and_lock handles irqsave/restore when
taking/releasing the spin lock.  With this variant the call of
local_irq_save/restore is no longer required.

[bigeasy@linutronix.de: s@atomic_dec_and_lock@refcount_dec_and_lock@g]
Link: http://lkml.kernel.org/r/20180703200141.28415-5-bigeasy@linutronix.de
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/backing-dev.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 55a233d75f39..f5981e9d6ae2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
-	if (!refcount_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
-		local_irq_restore(flags);
+	if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
 		return;
-	}
 
 	/* bdi might already have been destroyed leaving @congested unlinked */
 	if (congested->__bdi) {
-- 
cgit v1.2.3


From fc37191272a972d449bab3d8247cf52cadf5d4e6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 21 Aug 2018 21:55:38 -0700
Subject: userns: use refcount_t for reference counting instead atomic_t

refcount_t type and corresponding API should be used instead of atomic_t
wh en the variable is used as a reference counter.  This avoids accidental
refcounter overflows that might lead to use-after-free situations.

Link: http://lkml.kernel.org/r/20180703200141.28415-6-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/user.h | 5 +++--
 kernel/user.c              | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 96fe289c4c6e..39ad98c09c58 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
 struct key;
@@ -12,7 +13,7 @@ struct key;
  * Some day this will be a full-fledged user tracking system..
  */
 struct user_struct {
-	atomic_t __count;	/* reference count */
+	refcount_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
 #ifdef CONFIG_FANOTIFY
@@ -59,7 +60,7 @@ extern struct user_struct root_user;
 extern struct user_struct * alloc_uid(kuid_t);
 static inline struct user_struct *get_uid(struct user_struct *u)
 {
-	atomic_inc(&u->__count);
+	refcount_inc(&u->__count);
 	return u;
 }
 extern void free_uid(struct user_struct *);
diff --git a/kernel/user.c b/kernel/user.c
index 36288d840675..5f65ef195259 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock);
 
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
-	.__count	= ATOMIC_INIT(1),
+	.__count	= REFCOUNT_INIT(1),
 	.processes	= ATOMIC_INIT(1),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
@@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
 
 	hlist_for_each_entry(user, hashent, uidhash_node) {
 		if (uid_eq(user->uid, uid)) {
-			atomic_inc(&user->__count);
+			refcount_inc(&user->__count);
 			return user;
 		}
 	}
@@ -170,7 +170,7 @@ void free_uid(struct user_struct *up)
 		return;
 
 	local_irq_save(flags);
-	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+	if (refcount_dec_and_lock(&up->__count, &uidhash_lock))
 		free_user(up, flags);
 	else
 		local_irq_restore(flags);
@@ -191,7 +191,7 @@ struct user_struct *alloc_uid(kuid_t uid)
 			goto out_unlock;
 
 		new->uid = uid;
-		atomic_set(&new->__count, 1);
+		refcount_set(&new->__count, 1);
 		ratelimit_state_init(&new->ratelimit, HZ, 100);
 		ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
-- 
cgit v1.2.3


From ce0a568d327968367f294ea2425fe2c6f4f33ed2 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Tue, 21 Aug 2018 21:55:42 -0700
Subject: userns: use irqsave variant of refcount_dec_and_lock()

The irqsave variant of refcount_dec_and_lock handles irqsave/restore when
taking/releasing the spin lock.  With this variant the call of
local_irq_save/restore is no longer required.

[bigeasy@linutronix.de: s@atomic_dec_and_lock@refcount_dec_and_lock@g]
Link: http://lkml.kernel.org/r/20180703200141.28415-7-bigeasy@linutronix.de
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/user.c b/kernel/user.c
index 5f65ef195259..0df9b1640b2a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,11 +169,8 @@ void free_uid(struct user_struct *up)
 	if (!up)
 		return;
 
-	local_irq_save(flags);
-	if (refcount_dec_and_lock(&up->__count, &uidhash_lock))
+	if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags))
 		free_user(up, flags);
-	else
-		local_irq_restore(flags);
 }
 
 struct user_struct *alloc_uid(kuid_t uid)
-- 
cgit v1.2.3


From 203583990cb675aeddff6d7623f68268068c078c Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 21 Aug 2018 21:55:45 -0700
Subject: linux/compiler.h: don't use bool

Appararently, it's possible to have a non-trivial TU include a few
headers, including linux/build_bug.h, without ending up with
linux/types.h.  So the 0day bot sent me

config: um-x86_64_defconfig (attached as .config)

>> include/linux/compiler.h:316:3: error: unknown type name 'bool'; did you mean '_Bool'?
      bool __cond = !(condition);    \

for something I'm working on.

Rather than contributing to the #include madness and including
linux/types.h from compiler.h, just use int.

Link: http://lkml.kernel.org/r/20180817101036.20969-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Christopher Li <sparse@chrisli.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 42506e4d1f53..c8eab637a2a7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -313,7 +313,7 @@ unsigned long read_word_at_a_time(const void *addr)
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix)		\
 	do {								\
-		bool __cond = !(condition);				\
+		int __cond = !(condition);				\
 		extern void prefix ## suffix(void) __compiletime_error(msg); \
 		if (__cond)						\
 			prefix ## suffix();				\
-- 
cgit v1.2.3


From 91bc9aaf746ae41016bd6b61a48133e162542574 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:55:49 -0700
Subject: kernel/crash_core.c: print timestamp using time64_t

The get_seconds() call returns a 32-bit timestamp on some architectures,
and will overflow in the future.  The newer ktime_get_real_seconds()
always returns a 64-bit timestamp that does not suffer from this problem.

Link: http://lkml.kernel.org/r/20180618150329.941903-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Petr Tesarik <ptesarik@suse.cz>
Cc: Marc-Andr Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/crash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a683bfb0530f..933cb3e45b98 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void)
 	if (vmcoreinfo_data_safecopy)
 		vmcoreinfo_data = vmcoreinfo_data_safecopy;
 
-	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+	vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
 	update_vmcoreinfo_note();
 }
 
-- 
cgit v1.2.3


From a2e514453861dd39b53b7a50b6771bd3f9852078 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Tue, 21 Aug 2018 21:55:52 -0700
Subject: kernel/hung_task.c: allow to set checking interval separately from
 timeout

Currently task hung checking interval is equal to timeout, as the result
hung is detected anywhere between timeout and 2*timeout.  This is fine for
most interactive environments, but this hurts automated testing setups
(syzbot).  In an automated setup we need to strictly order CPU lockup <
RCU stall < workqueue lockup < task hung < silent loss, so that RCU stall
is not detected as task hung and task hung is not detected as silent
machine loss.  The large variance in task hung detection timeout requires
setting silent machine loss timeout to a very large value (e.g.  if task
hung is 3 mins, then silent loss need to be set to ~7 mins).  The
additional 3 minutes significantly reduce testing efficiency because
usually we crash kernel within a minute, and this can add hours to bug
localization process as it needs to do dozens of tests.

Allow setting checking interval separately from timeout.  This allows to
set timeout to, say, 3 minutes, but checking interval to 10 secs.

The interval is controlled via a new hung_task_check_interval_secs sysctl,
similar to the existing hung_task_timeout_secs sysctl.  The default value
of 0 results in the current behavior: checking interval is equal to
timeout.

[akpm@linux-foundation.org: update hung_task_timeout_max's comment]
Link: http://lkml.kernel.org/r/20180611111004.203513-1-dvyukov@google.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 15 ++++++++++++++-
 include/linux/sched.h           |  1 +
 include/linux/sched/sysctl.h    |  1 +
 kernel/fork.c                   |  1 +
 kernel/hung_task.c              | 15 ++++++++++++++-
 kernel/sysctl.c                 | 13 ++++++++++++-
 6 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 59585030cbaf..9d38e75b19a0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -38,6 +38,7 @@ show up in /proc/sys/kernel:
 - hung_task_panic
 - hung_task_check_count
 - hung_task_timeout_secs
+- hung_task_check_interval_secs
 - hung_task_warnings
 - hyperv_record_panic_msg
 - kexec_load_disabled
@@ -355,7 +356,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
 
 hung_task_timeout_secs:
 
-Check interval. When a task in D state did not get scheduled
+When a task in D state did not get scheduled
 for more than this value report a warning.
 This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
 
@@ -364,6 +365,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}.
 
 ==============================================================
 
+hung_task_check_interval_secs:
+
+Hung task check interval. If hung task checking is enabled
+(see hung_task_timeout_secs), the check is done every
+hung_task_check_interval_secs seconds.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0 (default): means use hung_task_timeout_secs as checking interval.
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+==============================================================
+
 hung_task_warnings:
 
 The maximum number of warnings to report. During a check interval
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 789923fbee3a..58eb3a2bc695 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -853,6 +853,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
 	unsigned long			last_switch_count;
+	unsigned long			last_switch_time;
 #endif
 	/* Filesystem information: */
 	struct fs_struct		*fs;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 913488d828cb..a9c32daeb9d8 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -10,6 +10,7 @@ struct ctl_table;
 extern int	     sysctl_hung_task_check_count;
 extern unsigned int  sysctl_hung_task_panic;
 extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_check_interval_secs;
 extern int sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c760effa42e..8bcef2413f1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1302,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	tsk->nvcsw = tsk->nivcsw = 0;
 #ifdef CONFIG_DETECT_HUNG_TASK
 	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+	tsk->last_switch_time = 0;
 #endif
 
 	tsk->mm = NULL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 32b479468e4d..b9132d1269ef 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
  */
 unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
 
+/*
+ * Zero (default value) means use sysctl_hung_task_timeout_secs:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+
 int __read_mostly sysctl_hung_task_warnings = 10;
 
 static int __read_mostly did_panic;
@@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 	if (switch_count != t->last_switch_count) {
 		t->last_switch_count = switch_count;
+		t->last_switch_time = jiffies;
 		return;
 	}
+	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+		return;
 
 	trace_sched_process_hang(t);
 
@@ -245,8 +253,13 @@ static int watchdog(void *dummy)
 
 	for ( ; ; ) {
 		unsigned long timeout = sysctl_hung_task_timeout_secs;
-		long t = hung_timeout_jiffies(hung_last_checked, timeout);
+		unsigned long interval = sysctl_hung_task_check_interval_secs;
+		long t;
 
+		if (interval == 0)
+			interval = timeout;
+		interval = min_t(unsigned long, interval, timeout);
+		t = hung_timeout_jiffies(hung_last_checked, interval);
 		if (t <= 0) {
 			if (!atomic_xchg(&reset_hung_task, 0))
 				check_hung_uninterruptible_tasks(timeout);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f22f76b7a138..dbdcb6673b27 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -145,7 +145,10 @@ static int minolduid;
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
 
-/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
 #ifdef CONFIG_DETECT_HUNG_TASK
 static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #endif
@@ -1090,6 +1093,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dohung_task_timeout_secs,
 		.extra2		= &hung_task_timeout_max,
 	},
+	{
+		.procname	= "hung_task_check_interval_secs",
+		.data		= &sysctl_hung_task_check_interval_secs,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_dohung_task_timeout_secs,
+		.extra2		= &hung_task_timeout_max,
+	},
 	{
 		.procname	= "hung_task_warnings",
 		.data		= &sysctl_hung_task_warnings,
-- 
cgit v1.2.3


From bc63804619465ad98af6d9086dab601a653066fb Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 21 Aug 2018 21:55:56 -0700
Subject: spelling.txt: add more spellings to spelling.txt

Here are some of the more common spelling mistakes and typos that I've
found while fixing up spelling mistakes in the kernel over the past 6
months.

Link: http://lkml.kernel.org/r/20180629150603.1159-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/spelling.txt | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 9a058cff49d4..d2688d55c34b 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -10,6 +10,8 @@
 abandonning||abandoning
 abigious||ambiguous
 abitrate||arbitrate
+abord||abort
+aboslute||absolute
 abov||above
 abreviated||abbreviated
 absense||absence
@@ -25,6 +27,7 @@ accessable||accessible
 accesss||access
 accidentaly||accidentally
 accidentually||accidentally
+acclerated||accelerated
 accoding||according
 accomodate||accommodate
 accomodates||accommodates
@@ -58,12 +61,15 @@ addres||address
 adddress||address
 addreses||addresses
 addresss||address
+addrress||address
 aditional||additional
 aditionally||additionally
 aditionaly||additionally
 adminstrative||administrative
 adress||address
 adresses||addresses
+adrresses||addresses
+advertisment||advertisement
 adviced||advised
 afecting||affecting
 againt||against
@@ -100,6 +106,7 @@ alue||value
 ambigious||ambiguous
 amoung||among
 amout||amount
+amplifer||amplifier
 an union||a union
 an user||a user
 an userspace||a userspace
@@ -145,11 +152,15 @@ assistent||assistant
 assocation||association
 associcated||associated
 assotiated||associated
+asssert||assert
 assum||assume
 assumtpion||assumption
 asuming||assuming
 asycronous||asynchronous
 asynchnous||asynchronous
+asynchromous||asynchronous
+asymetric||asymmetric
+asymmeric||asymmetric
 atomatically||automatically
 atomicly||atomically
 atempt||attempt
@@ -172,6 +183,7 @@ avaible||available
 availabe||available
 availabled||available
 availablity||availability
+availaible||available
 availale||available
 availavility||availability
 availble||available
@@ -206,8 +218,11 @@ borad||board
 boundry||boundary
 brievely||briefly
 broadcat||broadcast
+bufufer||buffer
 cacluated||calculated
+caculate||calculate
 caculation||calculation
+cadidate||candidate
 calender||calendar
 calescing||coalescing
 calle||called
@@ -221,12 +236,14 @@ capabilty||capability
 capabitilies||capabilities
 capatibilities||capabilities
 capapbilities||capabilities
+caputure||capture
 carefuly||carefully
 cariage||carriage
 catagory||category
 cehck||check
 challange||challenge
 challanges||challenges
+chache||cache
 chanell||channel
 changable||changeable
 chanined||chained
@@ -240,6 +257,7 @@ charaters||characters
 charcter||character
 chcek||check
 chck||check
+checksumed||checksummed
 checksuming||checksumming
 childern||children
 childs||children
@@ -292,8 +310,10 @@ comunication||communication
 conbination||combination
 conditionaly||conditionally
 conected||connected
+conector||connector
 connecetd||connected
 configuartion||configuration
+configuation||configuration
 configuratoin||configuration
 configuraton||configuration
 configuretion||configuration
@@ -315,6 +335,7 @@ continous||continuous
 continously||continuously
 continueing||continuing
 contraints||constraints
+contruct||construct
 contol||control
 contoller||controller
 controled||controlled
@@ -343,6 +364,7 @@ dafault||default
 deafult||default
 deamon||daemon
 decompres||decompress
+decsribed||described
 decription||description
 dectected||detected
 defailt||default
@@ -379,6 +401,7 @@ desctiptor||descriptor
 desriptor||descriptor
 desriptors||descriptors
 destionation||destination
+destoried||destroyed
 destory||destroy
 destoryed||destroyed
 destorys||destroys
@@ -404,18 +427,25 @@ diffrentiate||differentiate
 difinition||definition
 dimesions||dimensions
 diplay||display
+directon||direction
 direectly||directly
+diregard||disregard
 disassocation||disassociation
 disapear||disappear
 disapeared||disappeared
 disappared||disappeared
+disbale||disable
+disbaled||disabled
 disble||disable
 disbled||disabled
 disconnet||disconnect
 discontinous||discontinuous
+disharge||discharge
 dispertion||dispersion
 dissapears||disappears
 distiction||distinction
+divisable||divisible
+divsiors||divisors
 docuentation||documentation
 documantation||documentation
 documentaion||documentation
@@ -427,6 +457,7 @@ downlad||download
 downlads||downloads
 druing||during
 dynmaic||dynamic
+eanable||enable
 easilly||easily
 ecspecially||especially
 edditable||editable
@@ -484,9 +515,12 @@ exprimental||experimental
 extened||extended
 extensability||extensibility
 extention||extension
+extenstion||extension
 extracter||extractor
+faield||failed
 falied||failed
 faild||failed
+failer||failure
 faill||fail
 failied||failed
 faillure||failure
@@ -520,6 +554,7 @@ forseeable||foreseeable
 forse||force
 fortan||fortran
 forwardig||forwarding
+frambuffer||framebuffer
 framming||framing
 framwork||framework
 frequncy||frequency
@@ -527,6 +562,7 @@ frome||from
 fucntion||function
 fuction||function
 fuctions||functions
+funcation||function
 funcion||function
 functionallity||functionality
 functionaly||functionally
@@ -540,6 +576,7 @@ futrue||future
 gaurenteed||guaranteed
 generiously||generously
 genereate||generate
+genereted||generated
 genric||generic
 globel||global
 grabing||grabbing
@@ -553,6 +590,7 @@ guarentee||guarantee
 halfs||halves
 hander||handler
 handfull||handful
+hanlde||handle
 hanled||handled
 happend||happened
 harware||hardware
@@ -561,6 +599,7 @@ helpfull||helpful
 hybernate||hibernate
 hierachy||hierarchy
 hierarchie||hierarchy
+homogenous||homogeneous
 howver||however
 hsould||should
 hypervior||hypervisor
@@ -568,6 +607,8 @@ hypter||hyper
 identidier||identifier
 iligal||illegal
 illigal||illegal
+illgal||illegal
+iomaped||iomapped
 imblance||imbalance
 immeadiately||immediately
 immedaite||immediate
@@ -618,10 +659,12 @@ initation||initiation
 initators||initiators
 initialiazation||initialization
 initializiation||initialization
+initialze||initialize
 initialzed||initialized
 initilization||initialization
 initilize||initialize
 inofficial||unofficial
+inrerface||interface
 insititute||institute
 instal||install
 instanciated||instantiated
@@ -657,6 +700,7 @@ intregral||integral
 intrrupt||interrupt
 intterrupt||interrupt
 intuative||intuitive
+inavlid||invalid
 invaid||invalid
 invald||invalid
 invalde||invalid
@@ -683,6 +727,7 @@ langauge||language
 langugage||language
 lauch||launch
 layed||laid
+legnth||length
 leightweight||lightweight
 lengh||length
 lenght||length
@@ -696,6 +741,7 @@ licenceing||licencing
 loggging||logging
 loggin||login
 logile||logfile
+loobpack||loopback
 loosing||losing
 losted||lost
 machinary||machinery
@@ -703,6 +749,7 @@ maintainance||maintenance
 maintainence||maintenance
 maintan||maintain
 makeing||making
+mailformed||malformed
 malplaced||misplaced
 malplace||misplace
 managable||manageable
@@ -710,6 +757,7 @@ managment||management
 mangement||management
 manoeuvering||maneuvering
 mappping||mapping
+matchs||matches
 mathimatical||mathematical
 mathimatic||mathematic
 mathimatics||mathematics
@@ -725,6 +773,7 @@ messsage||message
 messsages||messages
 micropone||microphone
 microprocesspr||microprocessor
+migrateable||migratable
 milliseonds||milliseconds
 minium||minimum
 minimam||minimum
@@ -741,6 +790,7 @@ missmatch||mismatch
 miximum||maximum
 mmnemonic||mnemonic
 mnay||many
+modfiy||modify
 modulues||modules
 momery||memory
 memomry||memory
@@ -777,6 +827,7 @@ notifed||notified
 numebr||number
 numner||number
 obtaion||obtain
+obusing||abusing
 occassionally||occasionally
 occationally||occasionally
 occurance||occurrence
@@ -787,6 +838,7 @@ occure||occurred
 occured||occurred
 occuring||occurring
 offet||offset
+offloded||offloaded
 omited||omitted
 omiting||omitting
 omitt||omit
@@ -829,6 +881,7 @@ parametes||parameters
 parametised||parametrised
 paramter||parameter
 paramters||parameters
+parmaters||parameters
 particuarly||particularly
 particularily||particularly
 partiton||partition
@@ -837,6 +890,7 @@ passin||passing
 pathes||paths
 pecularities||peculiarities
 peformance||performance
+peforming||performing
 peice||piece
 pendantic||pedantic
 peprocessor||preprocessor
@@ -846,6 +900,7 @@ peroid||period
 persistance||persistence
 persistant||persistent
 plalform||platform
+platfoem||platform
 platfrom||platform
 plattform||platform
 pleaes||please
@@ -858,6 +913,7 @@ posible||possible
 positon||position
 possibilites||possibilities
 powerfull||powerful
+preamle||preamble
 preample||preamble
 preapre||prepare
 preceeded||preceded
@@ -870,6 +926,7 @@ prefered||preferred
 prefferably||preferably
 premption||preemption
 prepaired||prepared
+preperation||preparation
 pressre||pressure
 primative||primitive
 princliple||principle
@@ -935,6 +992,7 @@ recommanded||recommended
 recyle||recycle
 redircet||redirect
 redirectrion||redirection
+redundacy||redundancy
 reename||rename
 refcounf||refcount
 refence||reference
@@ -945,6 +1003,7 @@ refernces||references
 refernnce||reference
 refrence||reference
 registerd||registered
+registeration||registration
 registeresd||registered
 registerred||registered
 registes||registers
@@ -973,7 +1032,9 @@ requirment||requirement
 requred||required
 requried||required
 requst||request
+reregisteration||reregistration
 reseting||resetting
+reseverd||reserved
 resizeable||resizable
 resouce||resource
 resouces||resources
@@ -982,6 +1043,7 @@ responce||response
 ressizes||resizes
 ressource||resource
 ressources||resources
+restesting||retesting
 retransmited||retransmitted
 retreived||retrieved
 retreive||retrieve
@@ -1006,6 +1068,7 @@ sacrifying||sacrificing
 safly||safely
 safty||safety
 savable||saveable
+scaleing||scaling
 scaned||scanned
 scaning||scanning
 scarch||search
@@ -1014,6 +1077,8 @@ searchs||searches
 secquence||sequence
 secund||second
 segement||segment
+semaphone||semaphore
+senario||scenario
 senarios||scenarios
 sentivite||sensitive
 separatly||separately
@@ -1025,9 +1090,13 @@ seperate||separate
 seperatly||separately
 seperator||separator
 sepperate||separate
+seqeunce||sequence
+seqeuncer||sequencer
+seqeuencer||sequencer
 sequece||sequence
 sequencial||sequential
 serveral||several
+servive||service
 setts||sets
 settting||setting
 shotdown||shutdown
@@ -1073,6 +1142,7 @@ standartization||standardization
 standart||standard
 staticly||statically
 stoped||stopped
+stoping||stopping
 stoppped||stopped
 straming||streaming
 struc||struct
@@ -1085,6 +1155,7 @@ subdirectoires||subdirectories
 suble||subtle
 substract||subtract
 submition||submission
+suceed||succeed
 succesfully||successfully
 succesful||successful
 successed||succeeded
@@ -1108,6 +1179,7 @@ surpressed||suppressed
 surpresses||suppresses
 susbsystem||subsystem
 suspeneded||suspended
+suspsend||suspend
 suspicously||suspiciously
 swaping||swapping
 switchs||switches
@@ -1122,6 +1194,7 @@ swtich||switch
 symetric||symmetric
 synax||syntax
 synchonized||synchronized
+synchronuously||synchronously
 syncronize||synchronize
 syncronized||synchronized
 syncronizing||synchronizing
@@ -1130,11 +1203,14 @@ syste||system
 sytem||system
 sythesis||synthesis
 taht||that
+tansmit||transmit
 targetted||targeted
 targetting||targeting
+taskelt||tasklet
 teh||the
 temorary||temporary
 temproarily||temporarily
+thead||thread
 therfore||therefore
 thier||their
 threds||threads
@@ -1143,11 +1219,14 @@ thresold||threshold
 throught||through
 troughput||throughput
 thses||these
+tiggers||triggers
 tiggered||triggered
 tipically||typically
+timeing||timing
 timout||timeout
 tmis||this
 torerable||tolerable
+traking||tracking
 tramsmitted||transmitted
 tramsmit||transmit
 tranasction||transaction
@@ -1162,6 +1241,7 @@ transormed||transformed
 trasfer||transfer
 trasmission||transmission
 treshold||threshold
+trigerred||triggered
 trigerring||triggering
 trun||turn
 tunning||tuning
@@ -1169,6 +1249,8 @@ ture||true
 tyep||type
 udpate||update
 uesd||used
+uknown||unknown
+usupported||unsupported
 uncommited||uncommitted
 unconditionaly||unconditionally
 underun||underrun
@@ -1181,11 +1263,14 @@ unexpeted||unexpected
 unexpexted||unexpected
 unfortunatelly||unfortunately
 unifiy||unify
+uniterrupted||uninterrupted
 unintialized||uninitialized
 unkmown||unknown
 unknonw||unknown
 unknow||unknown
 unkown||unknown
+unamed||unnamed
+uneeded||unneeded
 unneded||unneeded
 unneccecary||unnecessary
 unneccesary||unnecessary
@@ -1210,6 +1295,7 @@ usefull||useful
 usege||usage
 usera||users
 usualy||usually
+usupported||unsupported
 utilites||utilities
 utillities||utilities
 utilties||utilities
@@ -1233,7 +1319,9 @@ virtaul||virtual
 virtiual||virtual
 visiters||visitors
 vitual||virtual
+vunerable||vulnerable
 wakeus||wakeups
+wathdog||watchdog
 wating||waiting
 wiat||wait
 wether||whether
-- 
cgit v1.2.3


From 271ca788774aaef9ca4c372573122cc4548cd401 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:00 -0700
Subject: arch: enable relative relocations for arm64, power and x86

Patch series "add support for relative references in special sections", v10.

This adds support for emitting special sections such as initcall arrays,
PCI fixups and tracepoints as relative references rather than absolute
references.  This reduces the size by 50% on 64-bit architectures, but
more importantly, it removes the need for carrying relocation metadata for
these sections in relocatable kernels (e.g., for KASLR) that needs to be
fixed up at boot time.  On arm64, this reduces the vmlinux footprint of
such a reference by 8x (8 byte absolute reference + 24 byte RELA entry vs
4 byte relative reference)

Patch #3 was sent out before as a single patch.  This series supersedes
the previous submission.  This version makes relative ksymtab entries
dependent on the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS rather
than trying to infer from kbuild test robot replies for which
architectures it should be blacklisted.

Patch #1 introduces the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS,
and sets it for the main architectures that are expected to benefit the
most from this feature, i.e., 64-bit architectures or ones that use
runtime relocations.

Patch #2 add support for #define'ing __DISABLE_EXPORTS to get rid of
ksymtab/kcrctab sections in decompressor and EFI stub objects when
rebuilding existing C files to run in a different context.

Patches #4 - #6 implement relative references for initcalls, PCI fixups
and tracepoints, respectively, all of which produce sections with order
~1000 entries on an arm64 defconfig kernel with tracing enabled.  This
means we save about 28 KB of vmlinux space for each of these patches.

[From the v7 series blurb, which included the jump_label patches as well]:

  For the arm64 kernel, all patches combined reduce the memory footprint
  of vmlinux by about 1.3 MB (using a config copied from Ubuntu that has
  KASLR enabled), of which ~1 MB is the size reduction of the RELA section
  in .init, and the remaining 300 KB is reduction of .text/.data.

This patch (of 6):

Before updating certain subsystems to use place relative 32-bit
relocations in special sections, to save space and reduce the number of
absolute relocations that need to be processed at runtime by relocatable
kernels, introduce the Kconfig symbol and define it for some architectures
that should be able to support and benefit from it.

Link: http://lkml.kernel.org/r/20180704083651.24360-2-ard.biesheuvel@linaro.org
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: James Morris <jmorris@namei.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>,
Cc: James Morris <james.morris@microsoft.com>
Cc: Jessica Yu <jeyu@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         | 10 ++++++++++
 arch/arm64/Kconfig   |  1 +
 arch/powerpc/Kconfig |  1 +
 arch/x86/Kconfig     |  1 +
 4 files changed, 13 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index c6148166a7b4..4426e9687d89 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -841,6 +841,16 @@ config REFCOUNT_FULL
 	  against various use-after-free conditions that can be used in
 	  security flaw exploits.
 
+config HAVE_ARCH_PREL32_RELOCATIONS
+	bool
+	help
+	  May be selected by an architecture if it supports place-relative
+	  32-bit relocations, both in the toolchain and in the module loader,
+	  in which case relative references can be used in special sections
+	  for PCI fixup, initcalls etc which are only half the size on 64 bit
+	  architectures, and don't require runtime relocation on relocatable
+	  kernels.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d0a53cc6293a..29e75b47becd 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -108,6 +108,7 @@ config ARM64
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
+	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a80669209155..db0b6eebbfa5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -177,6 +177,7 @@ config PPC
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
+	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_CBPF_JIT			if !PPC64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b0312f8947ce..512003f16889 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
+	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
-- 
cgit v1.2.3


From f922c4abdf7648523589abee9460c87f51630d2f Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:04 -0700
Subject: module: allow symbol exports to be disabled

To allow existing C code to be incorporated into the decompressor or the
UEFI stub, introduce a CPP macro that turns all EXPORT_SYMBOL_xxx
declarations into nops, and #define it in places where such exports are
undesirable.  Note that this gets rid of a rather dodgy redefine of
linux/export.h's header guard.

Link: http://lkml.kernel.org/r/20180704083651.24360-3-ard.biesheuvel@linaro.org
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Nicolas Pitre <nico@linaro.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/boot/compressed/kaslr.c      |  5 +----
 drivers/firmware/efi/libstub/Makefile |  1 +
 include/linux/export.h                | 11 ++++++++++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 302517929932..d1e19f358b6e 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -23,11 +23,8 @@
  * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
  * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
  * which is meaningless and will cause compiling error in some cases.
- * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
- * as empty.
  */
-#define _LINUX_EXPORT_H
-#define EXPORT_SYMBOL(sym)
+#define __DISABLE_EXPORTS
 
 #include "misc.h"
 #include "error.h"
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 88c322d7c71e..14c40a7750d1 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -24,6 +24,7 @@ KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
 				   -D__NO_FORTIFY \
 				   $(call cc-option,-ffreestanding) \
 				   $(call cc-option,-fno-stack-protector) \
+				   -D__DISABLE_EXPORTS
 
 GCOV_PROFILE			:= n
 KASAN_SANITIZE			:= n
diff --git a/include/linux/export.h b/include/linux/export.h
index b768d6dd3c90..ea7df303d68d 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -66,7 +66,16 @@ extern struct module __this_module;
 	__attribute__((section("___ksymtab" sec "+" #sym), used))	\
 	= { (unsigned long)&sym, __kstrtab_##sym }
 
-#if defined(__KSYM_DEPS__)
+#if defined(__DISABLE_EXPORTS)
+
+/*
+ * Allow symbol exports to be disabled completely so that C code may
+ * be reused in other execution contexts such as the UEFI stub or the
+ * decompressor.
+ */
+#define __EXPORT_SYMBOL(sym, sec)
+
+#elif defined(__KSYM_DEPS__)
 
 /*
  * For fine grained build dependencies, we want to tell the build system
-- 
cgit v1.2.3


From 7290d58095712a89f845e1bca05334796dd49ed2 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:09 -0700
Subject: module: use relative references for __ksymtab entries

An ordinary arm64 defconfig build has ~64 KB worth of __ksymtab entries,
each consisting of two 64-bit fields containing absolute references, to
the symbol itself and to a char array containing its name, respectively.

When we build the same configuration with KASLR enabled, we end up with an
additional ~192 KB of relocations in the .init section, i.e., one 24 byte
entry for each absolute reference, which all need to be processed at boot
time.

Given how the struct kernel_symbol that describes each entry is completely
local to module.c (except for the references emitted by EXPORT_SYMBOL()
itself), we can easily modify it to contain two 32-bit relative references
instead.  This reduces the size of the __ksymtab section by 50% for all
64-bit architectures, and gets rid of the runtime relocations entirely for
architectures implementing KASLR, either via standard PIE linking (arm64)
or using custom host tools (x86).

Note that the binary search involving __ksymtab contents relies on each
section being sorted by symbol name.  This is implemented based on the
input section names, not the names in the ksymtab entries, so this patch
does not interfere with that.

Given that the use of place-relative relocations requires support both in
the toolchain and in the module loader, we cannot enable this feature for
all architectures.  So make it dependent on whether
CONFIG_HAVE_ARCH_PREL32_RELOCATIONS is defined.

Link: http://lkml.kernel.org/r/20180704083651.24360-4-ard.biesheuvel@linaro.org
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Jessica Yu <jeyu@kernel.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/Kbuild   |  1 +
 arch/x86/include/asm/export.h |  5 -----
 include/asm-generic/export.h  | 12 +++++++++--
 include/linux/compiler.h      | 19 ++++++++++++++++++
 include/linux/export.h        | 46 ++++++++++++++++++++++++++++++++-----------
 kernel/module.c               | 32 ++++++++++++++++++++++++------
 6 files changed, 91 insertions(+), 24 deletions(-)
 delete mode 100644 arch/x86/include/asm/export.h

diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index de690c2d2e33..a0ab9ab61c75 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h
 
 generic-y += dma-contiguous.h
 generic-y += early_ioremap.h
+generic-y += export.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h
deleted file mode 100644
index 2a51d66689c5..000000000000
--- a/arch/x86/include/asm/export.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_64BIT
-#define KSYM_ALIGN 16
-#endif
-#include <asm-generic/export.h>
diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
index 68efb950a918..4d73e6e3c66c 100644
--- a/include/asm-generic/export.h
+++ b/include/asm-generic/export.h
@@ -5,12 +5,10 @@
 #define KSYM_FUNC(x) x
 #endif
 #ifdef CONFIG_64BIT
-#define __put .quad
 #ifndef KSYM_ALIGN
 #define KSYM_ALIGN 8
 #endif
 #else
-#define __put .long
 #ifndef KSYM_ALIGN
 #define KSYM_ALIGN 4
 #endif
@@ -19,6 +17,16 @@
 #define KCRC_ALIGN 4
 #endif
 
+.macro __put, val, name
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	.long	\val - ., \name - .
+#elif defined(CONFIG_64BIT)
+	.quad	\val, \name
+#else
+	.long	\val, \name
+#endif
+.endm
+
 /*
  * note on .section use: @progbits vs %progbits nastiness doesn't matter,
  * since we immediately emit into those sections anyway.
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c8eab637a2a7..681d866efb1e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr)
 
 #endif /* __KERNEL__ */
 
+/*
+ * Force the compiler to emit 'sym' as a symbol, so that we can reference
+ * it from inline assembler. Necessary in case 'sym' could be inlined
+ * otherwise, or eliminated entirely due to lack of references that are
+ * visible to the compiler.
+ */
+#define __ADDRESSABLE(sym) \
+	static void * __attribute__((section(".discard.addressable"), used)) \
+		__PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+
+/**
+ * offset_to_ptr - convert a relative memory offset to an absolute pointer
+ * @off:	the address of the 32-bit offset value
+ */
+static inline void *offset_to_ptr(const int *off)
+{
+	return (void *)((unsigned long)off + *off);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #ifndef __optimize
diff --git a/include/linux/export.h b/include/linux/export.h
index ea7df303d68d..ae072bc5aacf 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -18,12 +18,6 @@
 #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x)
 
 #ifndef __ASSEMBLY__
-struct kernel_symbol
-{
-	unsigned long value;
-	const char *name;
-};
-
 #ifdef MODULE
 extern struct module __this_module;
 #define THIS_MODULE (&__this_module)
@@ -54,17 +48,47 @@ extern struct module __this_module;
 #define __CRC_SYMBOL(sym, sec)
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#include <linux/compiler.h>
+/*
+ * Emit the ksymtab entry as a pair of relative references: this reduces
+ * the size by half on 64-bit architectures, and eliminates the need for
+ * absolute relocations that require runtime processing on relocatable
+ * kernels.
+ */
+#define __KSYMTAB_ENTRY(sym, sec)					\
+	__ADDRESSABLE(sym)						\
+	asm("	.section \"___ksymtab" sec "+" #sym "\", \"a\"	\n"	\
+	    "	.balign	8					\n"	\
+	    "__ksymtab_" #sym ":				\n"	\
+	    "	.long	" #sym "- .				\n"	\
+	    "	.long	__kstrtab_" #sym "- .			\n"	\
+	    "	.previous					\n")
+
+struct kernel_symbol {
+	int value_offset;
+	int name_offset;
+};
+#else
+#define __KSYMTAB_ENTRY(sym, sec)					\
+	static const struct kernel_symbol __ksymtab_##sym		\
+	__attribute__((section("___ksymtab" sec "+" #sym), used))	\
+	= { (unsigned long)&sym, __kstrtab_##sym }
+
+struct kernel_symbol {
+	unsigned long value;
+	const char *name;
+};
+#endif
+
 /* For every exported symbol, place a struct in the __ksymtab section */
 #define ___EXPORT_SYMBOL(sym, sec)					\
 	extern typeof(sym) sym;						\
 	__CRC_SYMBOL(sym, sec)						\
 	static const char __kstrtab_##sym[]				\
-	__attribute__((section("__ksymtab_strings"), aligned(1)))	\
+	__attribute__((section("__ksymtab_strings"), used, aligned(1)))	\
 	= #sym;								\
-	static const struct kernel_symbol __ksymtab_##sym		\
-	__used								\
-	__attribute__((section("___ksymtab" sec "+" #sym), used))	\
-	= { (unsigned long)&sym, __kstrtab_##sym }
+	__KSYMTAB_ENTRY(sym, sec)
 
 #if defined(__DISABLE_EXPORTS)
 
diff --git a/kernel/module.c b/kernel/module.c
index b046a32520d8..6746c85511fe 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -529,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms,
 	return true;
 }
 
+static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+	return sym->value;
+#endif
+}
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	return offset_to_ptr(&sym->name_offset);
+#else
+	return sym->name;
+#endif
+}
+
 static int cmp_name(const void *va, const void *vb)
 {
 	const char *a;
 	const struct kernel_symbol *b;
 	a = va; b = vb;
-	return strcmp(a, b->name);
+	return strcmp(a, kernel_symbol_name(b));
 }
 
 static bool find_symbol_in_section(const struct symsearch *syms,
@@ -2170,7 +2188,7 @@ void *__symbol_get(const char *symbol)
 		sym = NULL;
 	preempt_enable();
 
-	return sym ? (void *)sym->value : NULL;
+	return sym ? (void *)kernel_symbol_value(sym) : NULL;
 }
 EXPORT_SYMBOL_GPL(__symbol_get);
 
@@ -2200,10 +2218,12 @@ static int verify_export_symbols(struct module *mod)
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
 		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
-			if (find_symbol(s->name, &owner, NULL, true, false)) {
+			if (find_symbol(kernel_symbol_name(s), &owner, NULL,
+					true, false)) {
 				pr_err("%s: exports duplicate symbol %s"
 				       " (owned by %s)\n",
-				       mod->name, s->name, module_name(owner));
+				       mod->name, kernel_symbol_name(s),
+				       module_name(owner));
 				return -ENOEXEC;
 			}
 		}
@@ -2252,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 			ksym = resolve_symbol_wait(mod, info, name);
 			/* Ok if resolved.  */
 			if (ksym && !IS_ERR(ksym)) {
-				sym[i].st_value = ksym->value;
+				sym[i].st_value = kernel_symbol_value(ksym);
 				break;
 			}
 
@@ -2516,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value,
 		ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
 	else
 		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
-	return ks != NULL && ks->value == value;
+	return ks != NULL && kernel_symbol_value(ks) == value;
 }
 
 /* As per nm */
-- 
cgit v1.2.3


From 1b1eeca7e4c19fa76d409d4c7b338dba21f2df45 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:13 -0700
Subject: init: allow initcall tables to be emitted using relative references

Allow the initcall tables to be emitted using relative references that
are only half the size on 64-bit architectures and don't require fixups
at runtime on relocatable kernels.

Link: http://lkml.kernel.org/r/20180704083651.24360-5-ard.biesheuvel@linaro.org
Acked-by: James Morris <james.morris@microsoft.com>
Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init.h   | 44 +++++++++++++++++++++++++++++++++-----------
 init/main.c            | 32 ++++++++++++++++----------------
 kernel/printk/printk.c | 16 +++++++++-------
 security/security.c    | 17 ++++++++++-------
 4 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/include/linux/init.h b/include/linux/init.h
index bc27cf03c41e..2538d176dd1f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -116,8 +116,24 @@
 typedef int (*initcall_t)(void);
 typedef void (*exitcall_t)(void);
 
-extern initcall_t __con_initcall_start[], __con_initcall_end[];
-extern initcall_t __security_initcall_start[], __security_initcall_end[];
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+typedef int initcall_entry_t;
+
+static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
+{
+	return offset_to_ptr(entry);
+}
+#else
+typedef initcall_t initcall_entry_t;
+
+static inline initcall_t initcall_from_entry(initcall_entry_t *entry)
+{
+	return *entry;
+}
+#endif
+
+extern initcall_entry_t __con_initcall_start[], __con_initcall_end[];
+extern initcall_entry_t __security_initcall_start[], __security_initcall_end[];
 
 /* Used for contructor calls. */
 typedef void (*ctor_fn_t)(void);
@@ -167,9 +183,20 @@ extern bool initcall_debug;
  * as KEEP() in the linker script.
  */
 
-#define __define_initcall(fn, id) \
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define ___define_initcall(fn, id, __sec)			\
+	__ADDRESSABLE(fn)					\
+	asm(".section	\"" #__sec ".init\", \"a\"	\n"	\
+	"__initcall_" #fn #id ":			\n"	\
+	    ".long	" #fn " - .			\n"	\
+	    ".previous					\n");
+#else
+#define ___define_initcall(fn, id, __sec) \
 	static initcall_t __initcall_##fn##id __used \
-	__attribute__((__section__(".initcall" #id ".init"))) = fn;
+		__attribute__((__section__(#__sec ".init"))) = fn;
+#endif
+
+#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
 
 /*
  * Early initcalls run before initializing SMP.
@@ -208,13 +235,8 @@ extern bool initcall_debug;
 #define __exitcall(fn)						\
 	static exitcall_t __exitcall_##fn __exit_call = fn
 
-#define console_initcall(fn)					\
-	static initcall_t __initcall_##fn			\
-	__used __section(.con_initcall.init) = fn
-
-#define security_initcall(fn)					\
-	static initcall_t __initcall_##fn			\
-	__used __section(.security_initcall.init) = fn
+#define console_initcall(fn)	___define_initcall(fn,, .con_initcall)
+#define security_initcall(fn)	___define_initcall(fn,, .security_initcall)
 
 struct obs_kernel_param {
 	const char *str;
diff --git a/init/main.c b/init/main.c
index b729e1f22838..3a6ce89e128f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -902,18 +902,18 @@ int __init_or_module do_one_initcall(initcall_t fn)
 }
 
 
-extern initcall_t __initcall_start[];
-extern initcall_t __initcall0_start[];
-extern initcall_t __initcall1_start[];
-extern initcall_t __initcall2_start[];
-extern initcall_t __initcall3_start[];
-extern initcall_t __initcall4_start[];
-extern initcall_t __initcall5_start[];
-extern initcall_t __initcall6_start[];
-extern initcall_t __initcall7_start[];
-extern initcall_t __initcall_end[];
-
-static initcall_t *initcall_levels[] __initdata = {
+extern initcall_entry_t __initcall_start[];
+extern initcall_entry_t __initcall0_start[];
+extern initcall_entry_t __initcall1_start[];
+extern initcall_entry_t __initcall2_start[];
+extern initcall_entry_t __initcall3_start[];
+extern initcall_entry_t __initcall4_start[];
+extern initcall_entry_t __initcall5_start[];
+extern initcall_entry_t __initcall6_start[];
+extern initcall_entry_t __initcall7_start[];
+extern initcall_entry_t __initcall_end[];
+
+static initcall_entry_t *initcall_levels[] __initdata = {
 	__initcall0_start,
 	__initcall1_start,
 	__initcall2_start,
@@ -939,7 +939,7 @@ static char *initcall_level_names[] __initdata = {
 
 static void __init do_initcall_level(int level)
 {
-	initcall_t *fn;
+	initcall_entry_t *fn;
 
 	strcpy(initcall_command_line, saved_command_line);
 	parse_args(initcall_level_names[level],
@@ -950,7 +950,7 @@ static void __init do_initcall_level(int level)
 
 	trace_initcall_level(initcall_level_names[level]);
 	for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
-		do_one_initcall(*fn);
+		do_one_initcall(initcall_from_entry(fn));
 }
 
 static void __init do_initcalls(void)
@@ -981,11 +981,11 @@ static void __init do_basic_setup(void)
 
 static void __init do_pre_smp_initcalls(void)
 {
-	initcall_t *fn;
+	initcall_entry_t *fn;
 
 	trace_initcall_level("early");
 	for (fn = __initcall_start; fn < __initcall0_start; fn++)
-		do_one_initcall(*fn);
+		do_one_initcall(initcall_from_entry(fn));
 }
 
 /*
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 90b6ab01db59..918f386b2f6e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2788,7 +2788,8 @@ EXPORT_SYMBOL(unregister_console);
 void __init console_init(void)
 {
 	int ret;
-	initcall_t *call;
+	initcall_t call;
+	initcall_entry_t *ce;
 
 	/* Setup the default TTY line discipline. */
 	n_tty_init();
@@ -2797,13 +2798,14 @@ void __init console_init(void)
 	 * set up the console device so that later boot sequences can
 	 * inform about problems etc..
 	 */
-	call = __con_initcall_start;
+	ce = __con_initcall_start;
 	trace_initcall_level("console");
-	while (call < __con_initcall_end) {
-		trace_initcall_start((*call));
-		ret = (*call)();
-		trace_initcall_finish((*call), ret);
-		call++;
+	while (ce < __con_initcall_end) {
+		call = initcall_from_entry(ce);
+		trace_initcall_start(call);
+		ret = call();
+		trace_initcall_finish(call, ret);
+		ce++;
 	}
 }
 
diff --git a/security/security.c b/security/security.c
index 47cfff01d7ec..736e78da1ab9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -48,14 +48,17 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
 static void __init do_security_initcalls(void)
 {
 	int ret;
-	initcall_t *call;
-	call = __security_initcall_start;
+	initcall_t call;
+	initcall_entry_t *ce;
+
+	ce = __security_initcall_start;
 	trace_initcall_level("security");
-	while (call < __security_initcall_end) {
-		trace_initcall_start((*call));
-		ret = (*call) ();
-		trace_initcall_finish((*call), ret);
-		call++;
+	while (ce < __security_initcall_end) {
+		call = initcall_from_entry(ce);
+		trace_initcall_start(call);
+		ret = call();
+		trace_initcall_finish(call, ret);
+		ce++;
 	}
 }
 
-- 
cgit v1.2.3


From c9d8b55fa0191623fccb9ed67d2ff8f9159e9a89 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:18 -0700
Subject: PCI: Add support for relative addressing in quirk tables

Allow the PCI quirk tables to be emitted in a way that avoids absolute
references to the hook functions. This reduces the size of the entries,
and, more importantly, makes them invariant under runtime relocation
(e.g., for KASLR)

Link: http://lkml.kernel.org/r/20180704083651.24360-6-ard.biesheuvel@linaro.org
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/quirks.c | 12 +++++++++---
 include/linux/pci.h  | 20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 46f58a9771d7..ef7143a274e0 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -66,9 +66,15 @@ static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f,
 		     f->vendor == (u16) PCI_ANY_ID) &&
 		    (f->device == dev->device ||
 		     f->device == (u16) PCI_ANY_ID)) {
-			calltime = fixup_debug_start(dev, f->hook);
-			f->hook(dev);
-			fixup_debug_report(dev, calltime, f->hook);
+			void (*hook)(struct pci_dev *dev);
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+			hook = offset_to_ptr(&f->hook_offset);
+#else
+			hook = f->hook;
+#endif
+			calltime = fixup_debug_start(dev, hook);
+			hook(dev);
+			fixup_debug_report(dev, calltime, hook);
 		}
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9b87f1936906..e72ca8dd6241 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1809,7 +1809,11 @@ struct pci_fixup {
 	u16 device;			/* Or PCI_ANY_ID */
 	u32 class;			/* Or PCI_ANY_ID */
 	unsigned int class_shift;	/* should be 0, 8, 16 */
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	int hook_offset;
+#else
 	void (*hook)(struct pci_dev *dev);
+#endif
 };
 
 enum pci_fixup_pass {
@@ -1823,12 +1827,28 @@ enum pci_fixup_pass {
 	pci_fixup_suspend_late,	/* pci_device_suspend_late() */
 };
 
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				    class_shift, hook)			\
+	__ADDRESSABLE(hook)						\
+	asm(".section "	#sec ", \"a\"				\n"	\
+	    ".balign	16					\n"	\
+	    ".short "	#vendor ", " #device "			\n"	\
+	    ".long "	#class ", " #class_shift "		\n"	\
+	    ".long "	#hook " - .				\n"	\
+	    ".previous						\n");
+#define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook)			\
+	__DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
+				  class_shift, hook)
+#else
 /* Anonymous variables would be nice... */
 #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class,	\
 				  class_shift, hook)			\
 	static const struct pci_fixup __PASTE(__pci_fixup_##name,__LINE__) __used	\
 	__attribute__((__section__(#section), aligned((sizeof(void *)))))    \
 		= { vendor, device, class, class_shift, hook };
+#endif
 
 #define DECLARE_PCI_FIXUP_CLASS_EARLY(vendor, device, class,		\
 					 class_shift, hook)		\
-- 
cgit v1.2.3


From 46e0c9be206fa7b11aca75da2d6b8535d0139752 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 21 Aug 2018 21:56:22 -0700
Subject: kernel: tracepoints: add support for relative references

To avoid the need for relocating absolute references to tracepoint
structures at boot time when running relocatable kernels (which may
take a disproportionate amount of space), add the option to emit
these tables as relative references instead.

Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morris <james.morris@microsoft.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracepoint.h | 19 ++++++++++++++----
 kernel/tracepoint.c        | 49 ++++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d9a084c72541..7f2e16e76ac4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -249,6 +249,19 @@ extern void syscall_unregfunc(void);
 		return static_key_false(&__tracepoint_##name.key);	\
 	}
 
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+#define __TRACEPOINT_ENTRY(name)					\
+	asm("	.section \"__tracepoints_ptrs\", \"a\"		\n"	\
+	    "	.balign 4					\n"	\
+	    "	.long 	__tracepoint_" #name " - .		\n"	\
+	    "	.previous					\n")
+#else
+#define __TRACEPOINT_ENTRY(name)					 \
+	static struct tracepoint * const __tracepoint_ptr_##name __used	 \
+	__attribute__((section("__tracepoints_ptrs"))) =		 \
+		&__tracepoint_##name
+#endif
+
 /*
  * We have no guarantee that gcc and the linker won't up-align the tracepoint
  * structures, so we create an array of pointers that will be used for iteration
@@ -258,11 +271,9 @@ extern void syscall_unregfunc(void);
 	static const char __tpstrtab_##name[]				 \
 	__attribute__((section("__tracepoints_strings"))) = #name;	 \
 	struct tracepoint __tracepoint_##name				 \
-	__attribute__((section("__tracepoints"))) =			 \
+	__attribute__((section("__tracepoints"), used)) =		 \
 		{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
-	static struct tracepoint * const __tracepoint_ptr_##name __used	 \
-	__attribute__((section("__tracepoints_ptrs"))) =		 \
-		&__tracepoint_##name;
+	__TRACEPOINT_ENTRY(name);
 
 #define DEFINE_TRACE(name)						\
 	DEFINE_TRACE_FN(name, NULL, NULL);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 96db841bf0fc..bf2c06ef9afc 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -371,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
 
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+		struct tracepoint * const *end,
+		void (*fct)(struct tracepoint *tp, void *priv),
+		void *priv)
+{
+	if (!begin)
+		return;
+
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) {
+		const int *iter;
+
+		for (iter = (const int *)begin; iter < (const int *)end; iter++)
+			fct(offset_to_ptr(iter), priv);
+	} else {
+		struct tracepoint * const *iter;
+
+		for (iter = begin; iter < end; iter++)
+			fct(*iter, priv);
+	}
+}
+
 #ifdef CONFIG_MODULES
 bool trace_module_has_bad_taint(struct module *mod)
 {
@@ -435,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
  * Ensure the tracer unregistered the module's probes before the module
  * teardown is performed. Prevents leaks of probe and data pointers.
  */
-static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
-		struct tracepoint * const *end)
+static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
 {
-	struct tracepoint * const *iter;
-
-	if (!begin)
-		return;
-	for (iter = begin; iter < end; iter++)
-		WARN_ON_ONCE((*iter)->funcs);
+	WARN_ON_ONCE(tp->funcs);
 }
 
 static int tracepoint_module_coming(struct module *mod)
@@ -494,8 +509,9 @@ static void tracepoint_module_going(struct module *mod)
 			 * Called the going notifier before checking for
 			 * quiescence.
 			 */
-			tp_module_going_check_quiescent(mod->tracepoints_ptrs,
-				mod->tracepoints_ptrs + mod->num_tracepoints);
+			for_each_tracepoint_range(mod->tracepoints_ptrs,
+				mod->tracepoints_ptrs + mod->num_tracepoints,
+				tp_module_going_check_quiescent, NULL);
 			break;
 		}
 	}
@@ -547,19 +563,6 @@ static __init int init_tracepoints(void)
 __initcall(init_tracepoints);
 #endif /* CONFIG_MODULES */
 
-static void for_each_tracepoint_range(struct tracepoint * const *begin,
-		struct tracepoint * const *end,
-		void (*fct)(struct tracepoint *tp, void *priv),
-		void *priv)
-{
-	struct tracepoint * const *iter;
-
-	if (!begin)
-		return;
-	for (iter = begin; iter < end; iter++)
-		fct(*iter, priv);
-}
-
 /**
  * for_each_kernel_tracepoint - iteration on all kernel tracepoints
  * @fct: callback
-- 
cgit v1.2.3


From ee8ef0a4b167c1f2605bc9b5701c603224673d53 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 21 Aug 2018 21:56:26 -0700
Subject: epoll: use the waitqueue lock to protect ep->wq

Patch series "waitqueue lockdep annotation", v3.

This series adds a strategic lockdep_assert_held to __wake_up_common to
ensure callers really do hold the wait_queue_head lock when calling the
unlocked wake_up variants.  It turns out epoll did not do this for a
fairly common path (hit all the time by systemd during bootup), so the
second patch fixed this instance as well.

This patch (of 3):

The epoll code currently uses the unlocked waitqueue helpers for managing
ep->wq, but instead of holding the waitqueue lock around these calls, it
uses its own ep->lock spinlock.  Given that the waitqueue is not exposed
to the rest of the kernel this actually works ok at the moment, but
prevents the epoll locking rules from being enforced using lockdep.
Remove ep->lock and use the waitqueue lock to not only reduce the size of
struct eventpoll but also to make sure we can assert locking invariants in
the waitqueue code.

Link: http://lkml.kernel.org/r/20171214152344.6880-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Baron <jbaron@akamai.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 65 ++++++++++++++++++++++++++--------------------------------
 1 file changed, 29 insertions(+), 36 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67db22fe99c5..2737ef591b3e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
  *
  * 1) epmutex (mutex)
  * 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
+ * 3) ep->wq.lock (spinlock)
  *
  * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->wq.lock) because we manipulate objects
  * from inside the poll callback, that might be triggered from
  * a wake_up() that in turn might be called from IRQ context.
  * So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
  * of epoll file descriptors, we use the current recursion depth as
  * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->wq.lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
@@ -182,11 +182,10 @@ struct epitem {
  * This structure is stored inside the "private_data" member of the file
  * structure and represents the main data structure for the eventpoll
  * interface.
+ *
+ * Access to it is protected by the lock inside wq.
  */
 struct eventpoll {
-	/* Protect the access to this structure */
-	spinlock_t lock;
-
 	/*
 	 * This mutex is used to ensure that files are not removed
 	 * while epoll is using them. This is held during the event
@@ -210,7 +209,7 @@ struct eventpoll {
 	/*
 	 * This is a single linked list that chains all the "struct epitem" that
 	 * happened while transferring ready events to userspace w/out
-	 * holding ->lock.
+	 * holding ->wq.lock.
 	 */
 	struct epitem *ovflist;
 
@@ -688,17 +687,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 * because we want the "sproc" callback to be able to do it
 	 * in a lockless way.
 	 */
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 	list_splice_init(&ep->rdllist, &txlist);
 	ep->ovflist = NULL;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	/*
 	 * Now call the callback function.
 	 */
 	res = (*sproc)(ep, &txlist, priv);
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 	/*
 	 * During the time we spent inside the "sproc" callback, some
 	 * other events might have been queued by the poll callback.
@@ -740,7 +739,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	if (!ep_locked)
 		mutex_unlock(&ep->mtx);
@@ -768,12 +767,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	struct file *file = epi->ffd.file;
 
 	/*
-	 * Removes poll wait queue hooks. We _have_ to do this without holding
-	 * the "ep->lock" otherwise a deadlock might occur. This because of the
-	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
-	 * queue head lock when unregistering the wait queue. The wakeup callback
-	 * will run by holding the wait queue head lock and will call our callback
-	 * that will try to get "ep->lock".
+	 * Removes poll wait queue hooks.
 	 */
 	ep_unregister_pollwait(ep, epi);
 
@@ -784,10 +778,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 	if (ep_is_linked(&epi->rdllink))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 	/*
@@ -837,7 +831,7 @@ static void ep_free(struct eventpoll *ep)
 	 * Walks through the whole tree by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
+	 * us during this operation. So we can avoid the lock on "ep->wq.lock".
 	 * We do not need to lock ep->mtx, either, we only do it to prevent
 	 * a lockdep warning.
 	 */
@@ -1017,7 +1011,6 @@ static int ep_alloc(struct eventpoll **pep)
 	if (unlikely(!ep))
 		goto free_uid;
 
-	spin_lock_init(&ep->lock);
 	mutex_init(&ep->mtx);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
@@ -1122,7 +1115,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	__poll_t pollflags = key_to_poll(key);
 	int ewake = 0;
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 
 	ep_set_busy_poll_napi_id(epi);
 
@@ -1199,7 +1192,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		pwake++;
 
 out_unlock:
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1484,7 +1477,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 
 	/* record NAPI ID of new item if present */
 	ep_set_busy_poll_napi_id(epi);
@@ -1501,7 +1494,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 			pwake++;
 	}
 
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	atomic_long_inc(&ep->user->epoll_watches);
 
@@ -1527,10 +1520,10 @@ error_unregister:
 	 * list, since that is used/cleaned only inside a section bound by "mtx".
 	 * And ep_insert() is called with "mtx" held.
 	 */
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 	if (ep_is_linked(&epi->rdllink))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
@@ -1572,9 +1565,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	 * 1) Flush epi changes above to other CPUs.  This ensures
 	 *    we do not miss events from ep_poll_callback if an
 	 *    event occurs immediately after we call f_op->poll().
-	 *    We need this because we did not take ep->lock while
+	 *    We need this because we did not take ep->wq.lock while
 	 *    changing epi above (but ep_poll_callback does take
-	 *    ep->lock).
+	 *    ep->wq.lock).
 	 *
 	 * 2) We also need to ensure we do not miss _past_ events
 	 *    when calling f_op->poll().  This barrier also
@@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	 * list, push it inside.
 	 */
 	if (ep_item_poll(epi, &pt, 1)) {
-		spin_lock_irq(&ep->lock);
+		spin_lock_irq(&ep->wq.lock);
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
@@ -1604,7 +1597,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
-		spin_unlock_irq(&ep->lock);
+		spin_unlock_irq(&ep->wq.lock);
 	}
 
 	/* We have to call this outside the lock */
@@ -1756,7 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * caller specified a non blocking operation.
 		 */
 		timed_out = 1;
-		spin_lock_irqsave(&ep->lock, flags);
+		spin_lock_irqsave(&ep->wq.lock, flags);
 		goto check_events;
 	}
 
@@ -1765,7 +1758,7 @@ fetch_events:
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 
 	if (!ep_events_available(ep)) {
 		/*
@@ -1807,11 +1800,11 @@ fetch_events:
 				break;
 			}
 
-			spin_unlock_irqrestore(&ep->lock, flags);
+			spin_unlock_irqrestore(&ep->wq.lock, flags);
 			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 				timed_out = 1;
 
-			spin_lock_irqsave(&ep->lock, flags);
+			spin_lock_irqsave(&ep->wq.lock, flags);
 		}
 
 		__remove_wait_queue(&ep->wq, &wait);
@@ -1821,7 +1814,7 @@ check_events:
 	/* Is it worth to try to dig for events ? */
 	eavail = ep_events_available(ep);
 
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
-- 
cgit v1.2.3


From c430d1e848ff1240d126e79780f3c26208b8aed9 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Tue, 21 Aug 2018 21:56:30 -0700
Subject: userfaultfd: use fault_wqh lock

The userfaultfd code currently uses the unlocked waitqueue helpers for
managing fault_wqh, but instead of holding the waitqueue lock for this
waitqueue around these calls, it the waitqueue lock of
fault_pending_wq, which is a different waitqueue instance.  Given that
the waitqueue is not exposed to the rest of the kernel this actually
works ok at the moment, but prevents the userfaultfd locking rules from
being enforced using lockdep.

Switch to the internally locked waitqueue helpers instead.  This means
that the lock inside fault_wqh now nests inside the fault_pending_wqh
lock, but that's not a problem since it was entirely unused before.

[hch@lst.de: slight changelog updates]
[rppt@linux.vnet.ibm.com: spotted changelog spellos]
Link: http://lkml.kernel.org/r/20171214152344.6880-3-hch@lst.de
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 15c265d450bf..f649023b19b5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -910,7 +910,7 @@ wakeup:
 	 */
 	spin_lock(&ctx->fault_pending_wqh.lock);
 	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
-	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
+	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
 	/* Flush pending events that may still wait on event_wqh */
@@ -1066,7 +1066,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			 * anyway.
 			 */
 			list_del(&uwq->wq.entry);
-			__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
 
 			write_seqcount_end(&ctx->refile_seq);
 
@@ -1215,7 +1215,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
 				     range);
 	if (waitqueue_active(&ctx->fault_wqh))
-		__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
+		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 }
 
-- 
cgit v1.2.3


From e05a8e4d88d16e088d83ce679ac3343ac66c936b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 21 Aug 2018 21:56:34 -0700
Subject: sched/wait: assert the wait_queue_head lock is held in
 __wake_up_common

Better ensure we actually hold the lock using lockdep than just commenting
on it.  Due to the various exported _locked interfaces it is far too easy
to get the locking wrong.

Link: http://lkml.kernel.org/r/20171214152344.6880-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/wait.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 870f97b313e3..5dd47f1103d1 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 	wait_queue_entry_t *curr, *next;
 	int cnt = 0;
 
+	lockdep_assert_held(&wq_head->lock);
+
 	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
 		curr = list_next_entry(bookmark, entry);
 
-- 
cgit v1.2.3


From 002b343669c474151954266e7fcf727bf7faa851 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:56:38 -0700
Subject: fs/epoll: loosen irq safety in ep_scan_ready_list()

Patch series "fs/epoll: loosen irq safety when possible".

Both patches replace saving+restoring interrupts when taking the ep->lock
(now the waitqueue lock), with just disabling local irqs.  This shows
immediate performance benefits in patch 1 for an epoll workload running on
Xen.  The main concern we need to have with this sort of changes in epoll
is the ep_poll_callback() which is passed to the wait queue wakeup and is
done very often under irq context, this patch does not touch this call.

Patches have been tested pretty heavily with the customer workload,
microbenchmarks, ltp testcases and two high level workloads that use epoll
under the hood: nginx and libevent benchmarks.

This patch (of 2):

Saving and restoring interrupts in ep_scan_ready_list() is an
overkill as it is never called with interrupts disabled. Loosen
this to simply disabling local irqs such that archs where managing
irqs is expensive or virtual environments. This patch yields
some throughput improvements on a workload that is epoll intensive
running on a single Xen DomU.

1 Job	 7500	-->    8800 enq/s  (+17%)
2 Jobs	14000   -->   15200 enq/s  (+8%)
3 Jobs	20500	-->   22300 enq/s  (+8%)
4 Jobs	25000   -->   28000 enq/s  (+8-12)%

On bare metal:

For a 2-socket 40-core (ht) IvyBridge on a few workloads, unfortunately I
don't have a xen environment and the results for Xen I do have (which
numbers are in patch 1) I don't have the actual workload, so cannot
compare them directly.

1) Different configurations were used for a epoll_wait (pipes io)
   microbench (http://linux-scalability.org/epoll/epoll-test.c) and shows
   around a 7-10% improvement in overall total number of times the
   epoll_wait() loops when using both regular and nested epolls, so very
   raw numbers, but measurable nonetheless.

# threads	vanilla		dirty
     1		1677717		1805587
     2		1660510		1854064
     4		1610184		1805484
     8		1577696		1751222
     16		1568837		1725299
     32		1291532		1378463
     64		 752584		 787368

   Note that stddev is pretty small.

2) Another pipe test, which shows no real measurable improvement.
   (http://www.xmailserver.org/linux-patches/pipetest.c)

Link: http://lkml.kernel.org/r/20180720172956.2883-2-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2737ef591b3e..2247769eb941 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -667,7 +667,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 {
 	__poll_t res;
 	int pwake = 0;
-	unsigned long flags;
 	struct epitem *epi, *nepi;
 	LIST_HEAD(txlist);
 
@@ -687,17 +686,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 * because we want the "sproc" callback to be able to do it
 	 * in a lockless way.
 	 */
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	list_splice_init(&ep->rdllist, &txlist);
 	ep->ovflist = NULL;
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	/*
 	 * Now call the callback function.
 	 */
 	res = (*sproc)(ep, &txlist, priv);
 
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	/*
 	 * During the time we spent inside the "sproc" callback, some
 	 * other events might have been queued by the poll callback.
@@ -739,7 +738,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	if (!ep_locked)
 		mutex_unlock(&ep->mtx);
-- 
cgit v1.2.3


From 304b18b8d6af796c8ece221d34c92aeb1559789b Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:56:41 -0700
Subject: fs/epoll: loosen irq safety in epoll_insert() and epoll_remove()

Both functions are similar to the context of ep_modify(), called via
epoll_ctl(2).  Just like ep_modify(), saving and restoring interrupts is
an overkill in these calls as it will never be called with irqs disabled.
While ep_remove() can be called directly from EPOLL_CTL_DEL, it can also
be called when releasing the file, but this also complies with the above.

Link: http://lkml.kernel.org/r/20180720172956.2883-3-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2247769eb941..1b1abc461fc0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -762,7 +762,6 @@ static void epi_rcu_free(struct rcu_head *head)
  */
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
-	unsigned long flags;
 	struct file *file = epi->ffd.file;
 
 	/*
@@ -777,10 +776,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	if (ep_is_linked(&epi->rdllink))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 	/*
@@ -1409,7 +1408,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 {
 	int error, pwake = 0;
 	__poll_t revents;
-	unsigned long flags;
 	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
@@ -1476,7 +1474,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 
 	/* record NAPI ID of new item if present */
 	ep_set_busy_poll_napi_id(epi);
@@ -1493,7 +1491,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 			pwake++;
 	}
 
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	atomic_long_inc(&ep->user->epoll_watches);
 
@@ -1519,10 +1517,10 @@ error_unregister:
 	 * list, since that is used/cleaned only inside a section bound by "mtx".
 	 * And ep_insert() is called with "mtx" held.
 	 */
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	if (ep_is_linked(&epi->rdllink))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
-- 
cgit v1.2.3


From 92e641784055998879942d39c74d4f84fa750968 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:56:45 -0700
Subject: s/epoll: robustify irq safety with lockdep_assert_irqs_enabled()

Sprinkle lockdep_assert_irqs_enabled() checks in the functions that do not
save and restore interrupts when dealing with the ep->wq.lock.  These are
ep_scan_ready_list() and those called by epoll_ctl(): ep_insert, ep_modify
and ep_remove.

[akpm@linux-foundation.org: remove too-obvious comments]
Link: http://lkml.kernel.org/r/20180721183127.3busfa335zlcjeox@linux-r8p5
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1b1abc461fc0..58b96d8a898a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -670,6 +670,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	struct epitem *epi, *nepi;
 	LIST_HEAD(txlist);
 
+	lockdep_assert_irqs_enabled();
+
 	/*
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
@@ -764,6 +766,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
 	struct file *file = epi->ffd.file;
 
+	lockdep_assert_irqs_enabled();
+
 	/*
 	 * Removes poll wait queue hooks.
 	 */
@@ -1412,6 +1416,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
+	lockdep_assert_irqs_enabled();
+
 	user_watches = atomic_long_read(&ep->user->epoll_watches);
 	if (unlikely(user_watches >= max_user_watches))
 		return -ENOSPC;
@@ -1540,6 +1546,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	int pwake = 0;
 	poll_table pt;
 
+	lockdep_assert_irqs_enabled();
+
 	init_poll_funcptr(&pt, NULL);
 
 	/*
-- 
cgit v1.2.3


From 31bb82c9caa9bd8fa8865fa28a7a81f67ddc0539 Mon Sep 17 00:00:00 2001
From: Antonio Nino Diaz <antonio.ninodiaz@arm.com>
Date: Tue, 21 Aug 2018 21:56:48 -0700
Subject: get_maintainer: allow usage outside of kernel tree

Add option '--no-tree' to get_maintainer.pl script to allow using this
script in projects that aren't the Linux kernel if they use the same
format for their MAINTAINERS file.  This command is also available in
checkpatch.pl, for example.

Link: http://lkml.kernel.org/r/04452ac6-1575-f612-72c6-6ea88e70a9d5@arm.com
Signed-off-by: Antonio Nino Diaz <antonio.ninodiaz@arm.com>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/get_maintainer.pl | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index c87fa734e3e1..82a482f858ee 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -48,6 +48,7 @@ my $output_roles = 0;
 my $output_rolestats = 1;
 my $output_section_maxlen = 50;
 my $scm = 0;
+my $tree = 1;
 my $web = 0;
 my $subsystem = 0;
 my $status = 0;
@@ -255,6 +256,7 @@ if (!GetOptions(
 		'subsystem!' => \$subsystem,
 		'status!' => \$status,
 		'scm!' => \$scm,
+		'tree!' => \$tree,
 		'web!' => \$web,
 		'letters=s' => \$letters,
 		'pattern-depth=i' => \$pattern_depth,
@@ -319,7 +321,7 @@ if ($email &&
     die "$P: Please select at least 1 email option\n";
 }
 
-if (!top_of_kernel_tree($lk_path)) {
+if ($tree && !top_of_kernel_tree($lk_path)) {
     die "$P: The current directory does not appear to be "
 	. "a linux kernel source tree.\n";
 }
@@ -1031,13 +1033,14 @@ Other options:
   --sections => print all of the subsystem sections with pattern matches
   --letters => print all matching 'letter' types from all matching sections
   --mailmap => use .mailmap file (default: $email_use_mailmap)
+  --no-tree => run without a kernel tree
   --self-test => show potential issues with MAINTAINERS file content
   --version => show version
   --help => show this help information
 
 Default options:
-  [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0
-   --remove-duplicates --rolestats]
+  [--email --tree --nogit --git-fallback --m --r --n --l --multiline
+   --pattern-depth=0 --remove-duplicates --rolestats]
 
 Notes:
   Using "-f directory" may give unexpected results:
-- 
cgit v1.2.3


From 5f0baf95b1edfff5eb9b6f571febb859b047de97 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:56:52 -0700
Subject: get_maintainer.pl: add -mpath=<path or file> for MAINTAINERS file
 location

Add the ability to have an override for the location of the MAINTAINERS
file.

Miscellanea:

o Properly indent a few lines with leading spaces

Link: http://lkml.kernel.org/r/a86e69195076ed3c4c526fddc76b86c28e0a1e37.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Suggested-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/get_maintainer.pl | 48 +++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 82a482f858ee..0ebdefe74d5b 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -62,7 +62,7 @@ my $self_test = undef;
 my $version = 0;
 my $help = 0;
 my $find_maintainer_files = 0;
-
+my $maintainer_path;
 my $vcs_used = 0;
 
 my $exit = 0;
@@ -265,6 +265,7 @@ if (!GetOptions(
 		'fe|file-emails!' => \$file_emails,
 		'f|file' => \$from_filename,
 		'find-maintainer-files' => \$find_maintainer_files,
+		'mpath|maintainer-path=s' => \$maintainer_path,
 		'self-test:s' => \$self_test,
 		'v|version' => \$version,
 		'h|help|usage' => \$help,
@@ -386,26 +387,37 @@ sub find_ignore_git {
 read_all_maintainer_files();
 
 sub read_all_maintainer_files {
-    if (-d "${lk_path}MAINTAINERS") {
-        opendir(DIR, "${lk_path}MAINTAINERS") or die $!;
-        my @files = readdir(DIR);
-        closedir(DIR);
-        foreach my $file (@files) {
-            push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./);
-        }
-    }
-
-    if ($find_maintainer_files) {
-        find( { wanted => \&find_is_maintainer_file,
-                preprocess => \&find_ignore_git,
-                no_chdir => 1,
-        }, "${lk_path}");
+    my $path = "${lk_path}MAINTAINERS";
+    if (defined $maintainer_path) {
+	$path = $maintainer_path;
+	# Perl Cookbook tilde expansion if necessary
+	$path =~ s@^~([^/]*)@ $1 ? (getpwnam($1))[7] : ( $ENV{HOME} || $ENV{LOGDIR} || (getpwuid($<))[7])@ex;
+    }
+
+    if (-d $path) {
+	$path .= '/' if ($path !~ m@/$@);
+	if ($path eq "${lk_path}MAINTAINERS/") {
+	    opendir(DIR, "$path") or die $!;
+	    my @files = readdir(DIR);
+	    closedir(DIR);
+	    foreach my $file (@files) {
+		push(@mfiles, "$path$file") if ($file !~ /^\./);
+	    }
+	}
+	if ($find_maintainer_files) {
+	    find( { wanted => \&find_is_maintainer_file,
+		    preprocess => \&find_ignore_git,
+		    no_chdir => 1,
+		}, "$path");
+	}
+    } elsif (-f "$path") {
+	push(@mfiles, "$path");
     } else {
-        push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS";
+	die "$P: MAINTAINER file not found '$path'\n";
     }
-
+    die "$P: No MAINTAINER files found in '$path'\n" if (scalar(@mfiles) == 0);
     foreach my $file (@mfiles) {
-        read_maintainer_file("$file");
+	read_maintainer_file("$file");
     }
 }
 
-- 
cgit v1.2.3


From 0fbd75fd7feedd9e38fc1d3a01317aa23b83d29e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:56:55 -0700
Subject: get_maintainer: allow option --mpath <directory> to read all files in
 <directory>

There is an external use case for multiple private MAINTAINER style files
in a separate directory.  Allow it.

--mpath has a default of "./MAINTAINERS".

The value entered can be either a file or a directory.

The behaviors are now:

--mpath <file>          Read only the specific file as <MAINTAINER_TYPE> file
--mpath <directory>     Read all files in <directory> as <MAINTAINER_TYPE> files
--mpath <directory> --find-maintainer-files
                        Recurse through <directory> and read all files named MAINTAINERS

Link: http://lkml.kernel.org/r/991b2f20112d53863cd79e61d908f1d26d3e1971.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Tested-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/get_maintainer.pl | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 0ebdefe74d5b..c1c088ef1420 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -396,7 +396,12 @@ sub read_all_maintainer_files {
 
     if (-d $path) {
 	$path .= '/' if ($path !~ m@/$@);
-	if ($path eq "${lk_path}MAINTAINERS/") {
+	if ($find_maintainer_files) {
+	    find( { wanted => \&find_is_maintainer_file,
+		    preprocess => \&find_ignore_git,
+		    no_chdir => 1,
+		}, "$path");
+	} else {
 	    opendir(DIR, "$path") or die $!;
 	    my @files = readdir(DIR);
 	    closedir(DIR);
@@ -404,12 +409,6 @@ sub read_all_maintainer_files {
 		push(@mfiles, "$path$file") if ($file !~ /^\./);
 	    }
 	}
-	if ($find_maintainer_files) {
-	    find( { wanted => \&find_is_maintainer_file,
-		    preprocess => \&find_ignore_git,
-		    no_chdir => 1,
-		}, "$path");
-	}
     } elsif (-f "$path") {
 	push(@mfiles, "$path");
     } else {
-- 
cgit v1.2.3


From ccf7a6d457a8bd618c77468c38842b3f9d82cef4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 21 Aug 2018 21:56:59 -0700
Subject: lib/bitmap.c: drop unnecessary 0 check for u32 array operations

nbits == 0 is safe to be supplied to the function body, so remove
unnecessary checks in bitmap_to_arr32() and bitmap_from_arr32().

Link: http://lkml.kernel.org/r/20180531131914.44352-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Yury Norov <ynorov@caviumnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bitmap.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 730969c681cb..2fd07f6df0b8 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -1152,14 +1152,10 @@ EXPORT_SYMBOL(bitmap_free);
  *	@buf: array of u32 (in host byte order), the source bitmap
  *	@nbits: number of bits in @bitmap
  */
-void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
-						unsigned int nbits)
+void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
 {
 	unsigned int i, halfwords;
 
-	if (!nbits)
-		return;
-
 	halfwords = DIV_ROUND_UP(nbits, 32);
 	for (i = 0; i < halfwords; i++) {
 		bitmap[i/2] = (unsigned long) buf[i];
@@ -1183,9 +1179,6 @@ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
 {
 	unsigned int i, halfwords;
 
-	if (!nbits)
-		return;
-
 	halfwords = DIV_ROUND_UP(nbits, 32);
 	for (i = 0; i < halfwords; i++) {
 		buf[i] = (u32) (bitmap[i/2] & UINT_MAX);
-- 
cgit v1.2.3


From 9144d75e22cad3c89e6b2ccab551db9ee28d250a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 21 Aug 2018 21:57:03 -0700
Subject: include/linux/bitops.h: introduce BITS_PER_TYPE

net_dim.h has a rather useful extension to BITS_PER_BYTE to compute the
number of bits in a type (BITS_PER_BYTE * sizeof(T)), so promote the macro
to bitops.h, alongside BITS_PER_BYTE, for wider usage.

Link: http://lkml.kernel.org/r/20180706094458.14116-1-chris@chris-wilson.co.uk
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Andy Gospodarek <gospo@broadcom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h  | 3 ++-
 include/linux/net_dim.h | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index af419012d77d..7ddb1349394d 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -4,7 +4,8 @@
 #include <asm/types.h>
 #include <linux/bits.h>
 
-#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
+#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index db99240d00bd..c79e859408e6 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -363,7 +363,6 @@ static inline void net_dim_sample(u16 event_ctr,
 }
 
 #define NET_DIM_NEVENTS 64
-#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
 #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
 
 static inline void net_dim_calc_stats(struct net_dim_sample *start,
-- 
cgit v1.2.3


From b15f5f1ae18a70f2db0ec4b4e5fa525420a67270 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 21 Aug 2018 21:57:07 -0700
Subject: lib/test_debug_virtual.c: make struct pointer foo static

The pointer foo is local to the source and does not need to be
in global scope, so make it static.

Cleans up sparse warning:
symbol 'foo' was not declared. Should it be static?

Link: http://lkml.kernel.org/r/20180624112206.5722-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_debug_virtual.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/test_debug_virtual.c b/lib/test_debug_virtual.c
index b9cdeecc19dc..d5a06addeb27 100644
--- a/lib/test_debug_virtual.c
+++ b/lib/test_debug_virtual.c
@@ -15,7 +15,7 @@ struct foo {
 	unsigned int bar;
 };
 
-struct foo *foo;
+static struct foo *foo;
 
 static int __init test_debug_virtual_init(void)
 {
-- 
cgit v1.2.3


From feba04fd2cf8f6a74865338df2e3e1e94d6cfd13 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Tue, 21 Aug 2018 21:57:11 -0700
Subject: lib: add crc64 calculation routines

Patch series "add crc64 calculation as kernel library", v5.

This patchset adds basic implementation of crc64 calculation as a Linux
kernel library.  Since bcache already does crc64 by itself, this patchset
also modifies bcache code to use the new crc64 library routine.

Currently bcache is the only user of crc64 calculation, another potential
user is bcachefs which is on the way to be in mainline kernel.  Therefore
it makes sense to make crc64 calculation to be a public library.

bcache uses crc64 as storage checksum, if a change of crc lib routines
results an inconsistent result, the unmatched checksum may make bcache
'think' the on-disk is corrupted, such a change should be avoided or
detected as early as possible.  Therefore a patch is being prepared which
adds a crc test framework, to check consistency of different calculations.

This patch (of 2):

Add the re-write crc64 calculation routines for Linux kernel.  The CRC64
polynomical arithmetic follows ECMA-182 specification, inspired by CRC
paper of Dr.  Ross N.  Williams (see
http://www.ross.net/crc/download/crc_v3.txt) and other public domain
implementations.

All the changes work in this way,
- When Linux kernel is built, host program lib/gen_crc64table.c will be
  compiled to lib/gen_crc64table and executed.
- The output of gen_crc64table execution is an array called as lookup
  table (a.k.a POLY 0x42f0e1eba9ea369) which contain 256 64-bit long
  numbers, this table is dumped into header file lib/crc64table.h.
- Then the header file is included by lib/crc64.c for normal 64bit crc
  calculation.
- Function declaration of the crc64 calculation routines is placed in
  include/linux/crc64.h

Currently bcache is the only user of crc64_be(), another potential user is
bcachefs which is on the way to be in mainline kernel.  Therefore it makes
sense to move crc64 calculation into lib/crc64.c as public code.

[colyli@suse.de: fix review comments from v4]
  Link: http://lkml.kernel.org/r/20180726053352.2781-2-colyli@suse.de
Link: http://lkml.kernel.org/r/20180718165545.1622-2-colyli@suse.de
Signed-off-by: Coly Li <colyli@suse.de>
Co-developed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Noah Massey <noah.massey@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crc64.h | 11 +++++++++
 lib/.gitignore        |  2 ++
 lib/Kconfig           |  8 ++++++
 lib/Makefile          | 11 +++++++++
 lib/crc64.c           | 56 ++++++++++++++++++++++++++++++++++++++++++
 lib/gen_crc64table.c  | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 156 insertions(+)
 create mode 100644 include/linux/crc64.h
 create mode 100644 lib/crc64.c
 create mode 100644 lib/gen_crc64table.c

diff --git a/include/linux/crc64.h b/include/linux/crc64.h
new file mode 100644
index 000000000000..c756e65a1b58
--- /dev/null
+++ b/include/linux/crc64.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * See lib/crc64.c for the related specification and polynomial arithmetic.
+ */
+#ifndef _LINUX_CRC64_H
+#define _LINUX_CRC64_H
+
+#include <linux/types.h>
+
+u64 __pure crc64_be(u64 crc, const void *p, size_t len);
+#endif /* _LINUX_CRC64_H */
diff --git a/lib/.gitignore b/lib/.gitignore
index 09aae85418ab..f2a39c9e5485 100644
--- a/lib/.gitignore
+++ b/lib/.gitignore
@@ -2,5 +2,7 @@
 # Generated files
 #
 gen_crc32table
+gen_crc64table
 crc32table.h
+crc64table.h
 oid_registry_data.c
diff --git a/lib/Kconfig b/lib/Kconfig
index 706836ec314d..9c10b9852563 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -170,6 +170,14 @@ config CRC32_BIT
 
 endchoice
 
+config CRC64
+	tristate "CRC64 functions"
+	help
+	  This option is provided for the case where no in-kernel-tree
+	  modules require CRC64 functions, but a module built outside
+	  the kernel tree does. Such modules that use library CRC64
+	  functions require M here.
+
 config CRC4
 	tristate "CRC4 functions"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index d95bb2525101..9baefb6cb1a1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -103,6 +103,7 @@ obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
 obj-$(CONFIG_CRC_ITU_T)	+= crc-itu-t.o
 obj-$(CONFIG_CRC32)	+= crc32.o
+obj-$(CONFIG_CRC64)     += crc64.o
 obj-$(CONFIG_CRC32_SELFTEST)	+= crc32test.o
 obj-$(CONFIG_CRC4)	+= crc4.o
 obj-$(CONFIG_CRC7)	+= crc7.o
@@ -217,7 +218,9 @@ obj-$(CONFIG_FONT_SUPPORT) += fonts/
 obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
 
 hostprogs-y	:= gen_crc32table
+hostprogs-y	+= gen_crc64table
 clean-files	:= crc32table.h
+clean-files	+= crc64table.h
 
 $(obj)/crc32.o: $(obj)/crc32table.h
 
@@ -227,6 +230,14 @@ quiet_cmd_crc32 = GEN     $@
 $(obj)/crc32table.h: $(obj)/gen_crc32table
 	$(call cmd,crc32)
 
+$(obj)/crc64.o: $(obj)/crc64table.h
+
+quiet_cmd_crc64 = GEN     $@
+      cmd_crc64 = $< > $@
+
+$(obj)/crc64table.h: $(obj)/gen_crc64table
+	$(call cmd,crc64)
+
 #
 # Build a fast OID lookip registry from include/linux/oid_registry.h
 #
diff --git a/lib/crc64.c b/lib/crc64.c
new file mode 100644
index 000000000000..0ef8ae6ac047
--- /dev/null
+++ b/lib/crc64.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following ECMA-182 specification,
+ * which can be found from,
+ * http://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams has a great document to introduce the idea of CRC
+ * algorithm, here the CRC64 code is also inspired by the table-driven
+ * algorithm and detail example from this paper. This paper can be found
+ * from,
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table of a table-driven 64-bit CRC
+ * calculation, which is generated by gen_crc64table.c in kernel build
+ * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
+ * as well, which is defined as,
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ *   Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include "crc64table.h"
+
+MODULE_DESCRIPTION("CRC64 calculations");
+MODULE_LICENSE("GPL v2");
+
+/**
+ * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64
+ * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation,
+	or the previous crc64 value if computing incrementally.
+ * @p: pointer to buffer over which CRC64 is run
+ * @len: length of buffer @p
+ */
+u64 __pure crc64_be(u64 crc, const void *p, size_t len)
+{
+	size_t i, t;
+
+	const unsigned char *_p = p;
+
+	for (i = 0; i < len; i++) {
+		t = ((crc >> 56) ^ (*_p++)) & 0xFF;
+		crc = crc64table[t] ^ (crc << 8);
+	}
+
+	return crc;
+}
+EXPORT_SYMBOL_GPL(crc64_be);
diff --git a/lib/gen_crc64table.c b/lib/gen_crc64table.c
new file mode 100644
index 000000000000..9011926e4162
--- /dev/null
+++ b/lib/gen_crc64table.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate lookup table for the table-driven CRC64 calculation.
+ *
+ * gen_crc64table is executed in kernel build time and generates
+ * lib/crc64table.h. This header is included by lib/crc64.c for
+ * the table-driven CRC64 calculation.
+ *
+ * See lib/crc64.c for more information about which specification
+ * and polynomial arithmetic that gen_crc64table.c follows to
+ * generate the lookup table.
+ *
+ * Copyright 2018 SUSE Linux.
+ *   Author: Coly Li <colyli@suse.de>
+ */
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <linux/swab.h>
+
+#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
+
+static uint64_t crc64_table[256] = {0};
+
+static void generate_crc64_table(void)
+{
+	uint64_t i, j, c, crc;
+
+	for (i = 0; i < 256; i++) {
+		crc = 0;
+		c = i << 56;
+
+		for (j = 0; j < 8; j++) {
+			if ((crc ^ c) & 0x8000000000000000ULL)
+				crc = (crc << 1) ^ CRC64_ECMA182_POLY;
+			else
+				crc <<= 1;
+			c <<= 1;
+		}
+
+		crc64_table[i] = crc;
+	}
+}
+
+static void print_crc64_table(void)
+{
+	int i;
+
+	printf("/* this file is generated - do not edit */\n\n");
+	printf("#include <linux/types.h>\n");
+	printf("#include <linux/cache.h>\n\n");
+	printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t0x%016" PRIx64 "ULL", crc64_table[i]);
+		if (i & 0x1)
+			printf(",\n");
+		else
+			printf(", ");
+	}
+	printf("};\n");
+}
+
+int main(int argc, char *argv[])
+{
+	generate_crc64_table();
+	print_crc64_table();
+	return 0;
+}
-- 
cgit v1.2.3


From d23599630b0fdecf3b676140a97619def16f2060 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Tue, 21 Aug 2018 21:57:15 -0700
Subject: bcache: use routines from lib/crc64.c for CRC64 calculation

Now we have crc64 calculation in lib/crc64.c, it is unnecessary for
bcache to use its own version.  This patch changes bcache code to use
crc64 routines in lib/crc64.c.

Link: http://lkml.kernel.org/r/20180718165545.1622-3-colyli@suse.de
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Noah Massey <noah.massey@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bcache/Kconfig |   1 +
 drivers/md/bcache/util.c  | 131 ----------------------------------------------
 drivers/md/bcache/util.h  |  21 ++++++--
 3 files changed, 18 insertions(+), 135 deletions(-)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 17bf109c58e9..af247298409a 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,6 +1,7 @@
 
 config BCACHE
 	tristate "Block device as cache"
+	select CRC64
 	---help---
 	Allows a block device to be used as cache for other devices; uses
 	a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index b15256bcf0e7..c6a99dfa1ad9 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -279,134 +279,3 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 
 	return 0;
 }
-
-/*
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
- * use permitted, subject to terms of PostgreSQL license; see.)
-
- * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
- * usual sort of implementation. (See Ross Williams' excellent introduction
- * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
- * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
- * If we have no working 64-bit type, then fake it with two 32-bit registers.
- *
- * The present implementation is a normal (not "reflected", in Williams'
- * terms) 64-bit CRC, using initial all-ones register contents and a final
- * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
- * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
-*/
-
-static const uint64_t crc_table[256] = {
-	0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
-	0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
-	0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
-	0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
-	0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
-	0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
-	0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
-	0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
-	0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
-	0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
-	0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
-	0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
-	0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
-	0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
-	0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
-	0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
-	0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
-	0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
-	0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
-	0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
-	0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
-	0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
-	0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
-	0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
-	0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
-	0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
-	0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
-	0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
-	0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
-	0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
-	0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
-	0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
-	0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
-	0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
-	0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
-	0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
-	0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
-	0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
-	0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
-	0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
-	0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
-	0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
-	0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
-	0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
-	0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
-	0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
-	0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
-	0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
-	0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
-	0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
-	0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
-	0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
-	0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
-	0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
-	0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
-	0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
-	0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
-	0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
-	0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
-	0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
-	0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
-	0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
-	0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
-	0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
-	0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
-	0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
-	0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
-	0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
-	0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
-	0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
-	0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
-	0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
-	0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
-	0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
-	0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
-	0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
-	0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
-	0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
-	0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
-	0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
-	0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
-	0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
-	0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
-	0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
-	0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
-	0x9AFCE626CE85B507ULL,
-};
-
-uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
-{
-	const unsigned char *data = _data;
-
-	while (len--) {
-		int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
-		crc = crc_table[i] ^ (crc << 8);
-	}
-
-	return crc;
-}
-
-uint64_t bch_crc64(const void *data, size_t len)
-{
-	uint64_t crc = 0xffffffffffffffffULL;
-
-	crc = bch_crc64_update(crc, data, len);
-
-	return crc ^ 0xffffffffffffffffULL;
-}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f7b0133c9d2f..5ff055f0a653 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -11,6 +11,7 @@
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
+#include <linux/crc64.h>
 
 #include "closure.h"
 
@@ -542,6 +543,22 @@ dup:									\
 #define RB_PREV(ptr, member)						\
 	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
 
+static inline uint64_t bch_crc64(const void *p, size_t len)
+{
+	uint64_t crc = 0xffffffffffffffffULL;
+
+	crc = crc64_be(crc, p, len);
+	return crc ^ 0xffffffffffffffffULL;
+}
+
+static inline uint64_t bch_crc64_update(uint64_t crc,
+					const void *p,
+					size_t len)
+{
+	crc = crc64_be(crc, p, len);
+	return crc;
+}
+
 /* Does linear interpolation between powers of two */
 static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 {
@@ -561,8 +578,4 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
 {
 	return bdev->bd_inode->i_size >> 9;
 }
-
-uint64_t bch_crc64_update(uint64_t, const void *, size_t);
-uint64_t bch_crc64(const void *, size_t);
-
 #endif /* _BCACHE_UTIL_H */
-- 
cgit v1.2.3


From fd7338ef624fab2efbeb011c3e790693b4199c8e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 21 Aug 2018 21:57:18 -0700
Subject: lib/Kconfig: remove 'default n' for tests

It seems contributors follow the style of Kconfig entries where explicit
'default n' is present.  The default 'default' is 'n' already, thus, drop
these lines from Kconfig to make it more clear.

Link: http://lkml.kernel.org/r/20180719085131.79541-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Coly Li <colyli@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/Kconfig       |  1 -
 lib/Kconfig.debug | 17 -----------------
 2 files changed, 18 deletions(-)

diff --git a/lib/Kconfig b/lib/Kconfig
index 9c10b9852563..a3928d4438b5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -231,7 +231,6 @@ config AUDIT_COMPAT_GENERIC
 
 config RANDOM32_SELFTEST
 	bool "PRNG perform self test on init"
-	default n
 	help
 	  This option enables the 32 bit PRNG library functions to perform a
 	  self test on initialization.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ab1b599202bc..3589765141a8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1220,7 +1220,6 @@ config LOCK_TORTURE_TEST
 	tristate "torture tests for locking"
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
-	default n
 	help
 	  This option provides a kernel module that runs torture tests
 	  on kernel locking primitives.  The kernel module may be built
@@ -1687,7 +1686,6 @@ config LKDTM
 	tristate "Linux Kernel Dump Test Tool Module"
 	depends on DEBUG_FS
 	depends on BLOCK
-	default n
 	help
 	This module enables testing of the different dumping mechanisms by
 	inducing system failures at predefined crash points.
@@ -1721,7 +1719,6 @@ config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
 	depends on KPROBES
-	default n
 	help
 	  This option provides for testing basic kprobes functionality on
 	  boot. Samples of kprobe and kretprobe are inserted and
@@ -1732,7 +1729,6 @@ config KPROBES_SANITY_TEST
 config BACKTRACE_SELF_TEST
 	tristate "Self test for the backtrace code"
 	depends on DEBUG_KERNEL
-	default n
 	help
 	  This option provides a kernel module that can be used to test
 	  the kernel stack backtrace code. This option is not useful
@@ -1802,7 +1798,6 @@ config TEST_PRINTF
 
 config TEST_BITMAP
 	tristate "Test bitmap_*() family of functions at runtime"
-	default n
 	help
 	  Enable this option to test the bitmap functions at boot.
 
@@ -1823,7 +1818,6 @@ config TEST_OVERFLOW
 
 config TEST_RHASHTABLE
 	tristate "Perform selftest on resizable hash table"
-	default n
 	help
 	  Enable this option to test the rhashtable functions at boot.
 
@@ -1831,7 +1825,6 @@ config TEST_RHASHTABLE
 
 config TEST_HASH
 	tristate "Perform selftest on hash functions"
-	default n
 	help
 	  Enable this option to test the kernel's integer (<linux/hash.h>),
 	  string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
@@ -1842,7 +1835,6 @@ config TEST_HASH
 
 config TEST_PARMAN
 	tristate "Perform selftest on priority array manager"
-	default n
 	depends on PARMAN
 	help
 	  Enable this option to test priority array manager on boot
@@ -1852,7 +1844,6 @@ config TEST_PARMAN
 
 config TEST_LKM
 	tristate "Test module loading with 'hello world' module"
-	default n
 	depends on m
 	help
 	  This builds the "test_module" module that emits "Hello, world"
@@ -1866,7 +1857,6 @@ config TEST_LKM
 
 config TEST_USER_COPY
 	tristate "Test user/kernel boundary protections"
-	default n
 	depends on m
 	help
 	  This builds the "test_user_copy" module that runs sanity checks
@@ -1879,7 +1869,6 @@ config TEST_USER_COPY
 
 config TEST_BPF
 	tristate "Test BPF filter functionality"
-	default n
 	depends on m && NET
 	help
 	  This builds the "test_bpf" module that runs various test vectors
@@ -1893,7 +1882,6 @@ config TEST_BPF
 
 config FIND_BIT_BENCHMARK
 	tristate "Test find_bit functions"
-	default n
 	help
 	  This builds the "test_find_bit" module that measure find_*_bit()
 	  functions performance.
@@ -1902,7 +1890,6 @@ config FIND_BIT_BENCHMARK
 
 config TEST_FIRMWARE
 	tristate "Test firmware loading via userspace interface"
-	default n
 	depends on FW_LOADER
 	help
 	  This builds the "test_firmware" module that creates a userspace
@@ -1915,7 +1902,6 @@ config TEST_FIRMWARE
 
 config TEST_SYSCTL
 	tristate "sysctl test driver"
-	default n
 	depends on PROC_SYSCTL
 	help
 	  This builds the "test_sysctl" module. This driver enables to test the
@@ -1926,7 +1912,6 @@ config TEST_SYSCTL
 
 config TEST_UDELAY
 	tristate "udelay test driver"
-	default n
 	help
 	  This builds the "udelay_test" module that helps to make sure
 	  that udelay() is working properly.
@@ -1935,7 +1920,6 @@ config TEST_UDELAY
 
 config TEST_STATIC_KEYS
 	tristate "Test static keys"
-	default n
 	depends on m
 	help
 	  Test the static key interfaces.
@@ -1944,7 +1928,6 @@ config TEST_STATIC_KEYS
 
 config TEST_KMOD
 	tristate "kmod stress tester"
-	default n
 	depends on m
 	depends on BLOCK && (64BIT || LBDAF)	  # for XFS, BTRFS
 	depends on NETDEVICES && NET_CORE && INET # for TUN
-- 
cgit v1.2.3


From de9df3993cfffde468c1c226382f24cec7e9fedc Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 21 Aug 2018 21:57:22 -0700
Subject: lib/test_hexdump.c: fix failure on big endian cpu

On a big endian cpu, test_hexdump fails as follows.  The logs show that
bytes are expected in reversed order.

  [...]
  test_hexdump: Len: 24 buflen: 130 strlen: 97
  test_hexdump: Result: 97 'be32db7b 0a1893b2 70bac424 7d83349b a69c31ad 9c0face9                    .2.{....p..$}.4...1.....'
  test_hexdump: Expect: 97 '7bdb32be b293180a 24c4ba70 9b34837d ad319ca6 e9ac0f9c                    .2.{....p..$}.4...1.....'
  test_hexdump: Len: 8 buflen: 130 strlen: 77
  test_hexdump: Result: 77 'be32db7b0a1893b2                                                     .2.{....'
  test_hexdump: Expect: 77 'b293180a7bdb32be                                                     .2.{....'
  test_hexdump: Len: 6 buflen: 131 strlen: 87
  test_hexdump: Result: 87 'be32 db7b 0a18                                                                   .2.{..'
  test_hexdump: Expect: 87 '32be 7bdb 180a                                                                   .2.{..'
  test_hexdump: Len: 24 buflen: 131 strlen: 97
  test_hexdump: Result: 97 'be32db7b 0a1893b2 70bac424 7d83349b a69c31ad 9c0face9                    .2.{....p..$}.4...1.....'
  test_hexdump: Expect: 97 '7bdb32be b293180a 24c4ba70 9b34837d ad319ca6 e9ac0f9c                    .2.{....p..$}.4...1.....'
  test_hexdump: Len: 32 buflen: 131 strlen: 101
  test_hexdump: Result: 101 'be32db7b0a1893b2 70bac4247d83349b a69c31ad9c0face9 4cd1199943b1af0c  .2.{....p..$}.4...1.....L...C...'
  test_hexdump: Expect: 101 'b293180a7bdb32be 9b34837d24c4ba70 e9ac0f9cad319ca6 0cafb1439919d14c  .2.{....p..$}.4...1.....L...C...'
  test_hexdump: failed 801 out of 1184 tests

This patch fixes it.

Link: http://lkml.kernel.org/r/f3112437f62c2f48300535510918e8be1dceacfb.1533610877.git.christophe.leroy@c-s.fr
Fixes: 64d1d77a44697 ("hexdump: introduce test suite")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: rashmica <rashmicy@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_hexdump.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c
index 3f415d8101f3..626f580b4ff7 100644
--- a/lib/test_hexdump.c
+++ b/lib/test_hexdump.c
@@ -18,7 +18,7 @@ static const unsigned char data_b[] = {
 
 static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C...";
 
-static const char * const test_data_1_le[] __initconst = {
+static const char * const test_data_1[] __initconst = {
 	"be", "32", "db", "7b", "0a", "18", "93", "b2",
 	"70", "ba", "c4", "24", "7d", "83", "34", "9b",
 	"a6", "9c", "31", "ad", "9c", "0f", "ac", "e9",
@@ -32,16 +32,33 @@ static const char * const test_data_2_le[] __initconst = {
 	"d14c", "9919", "b143", "0caf",
 };
 
+static const char * const test_data_2_be[] __initconst = {
+	"be32", "db7b", "0a18", "93b2",
+	"70ba", "c424", "7d83", "349b",
+	"a69c", "31ad", "9c0f", "ace9",
+	"4cd1", "1999", "43b1", "af0c",
+};
+
 static const char * const test_data_4_le[] __initconst = {
 	"7bdb32be", "b293180a", "24c4ba70", "9b34837d",
 	"ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143",
 };
 
+static const char * const test_data_4_be[] __initconst = {
+	"be32db7b", "0a1893b2", "70bac424", "7d83349b",
+	"a69c31ad", "9c0face9", "4cd11999", "43b1af0c",
+};
+
 static const char * const test_data_8_le[] __initconst = {
 	"b293180a7bdb32be", "9b34837d24c4ba70",
 	"e9ac0f9cad319ca6", "0cafb1439919d14c",
 };
 
+static const char * const test_data_8_be[] __initconst = {
+	"be32db7b0a1893b2", "70bac4247d83349b",
+	"a69c31ad9c0face9", "4cd1199943b1af0c",
+};
+
 #define FILL_CHAR	'#'
 
 static unsigned total_tests __initdata;
@@ -56,6 +73,7 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize,
 	size_t l = len;
 	int gs = groupsize, rs = rowsize;
 	unsigned int i;
+	const bool is_be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
 
 	if (rs != 16 && rs != 32)
 		rs = 16;
@@ -67,13 +85,13 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize,
 		gs = 1;
 
 	if (gs == 8)
-		result = test_data_8_le;
+		result = is_be ? test_data_8_be : test_data_8_le;
 	else if (gs == 4)
-		result = test_data_4_le;
+		result = is_be ? test_data_4_be : test_data_4_le;
 	else if (gs == 2)
-		result = test_data_2_le;
+		result = is_be ? test_data_2_be : test_data_2_le;
 	else
-		result = test_data_1_le;
+		result = test_data_1;
 
 	/* hex dump */
 	p = test;
-- 
cgit v1.2.3


From d729593e492e1e4c7cd8a418ee227d0bd4d5f36d Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:26 -0700
Subject: checkpatch: add a --strict test for structs with bool member
 definitions

A struct with a bool member can have different sizes on various
architectures because neither bool size nor alignment is standardized.

So emit a message on the use of bool in structs only in .h files and not
.c files.

There is the real possibility that this test could have a false positive
when a bool is declared as an automatic, so limit the test to .h files
where the only false positive is for declarations in static inline
functions.

Link: http://lkml.kernel.org/r/95477c93db187bab6da8a8ba7c57836868446179.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 447857ffaf6b..5cb7cfd08ce2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6251,6 +6251,13 @@ sub process {
 			     "Avoid using bool as bitfield.  Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr);
 		}
 
+# check for bool use in .h files
+		if ($realfile =~ /\.h$/ &&
+		    $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*d+\s*)?;/) {
+			CHK("BOOL_MEMBER",
+			    "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr);
+		}
+
 # check for semaphores initialized locked
 		if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) {
 			WARN("CONSIDER_COMPLETION",
-- 
cgit v1.2.3


From 79682c0c00893c683678d85a402392b75e90311d Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:29 -0700
Subject: checkpatch: add --fix for CONCATENATED_STRING and STRING_FRAGMENTS

Add the ability to --fix these string issues.

e.g.:
	printk(KERN_INFO"bar" "baz"QUX);
converts to
	printk(KERN_INFO "barbaz" QUX);

Link: http://lkml.kernel.org/r/a9fb505ccfedffc5869d08832a7ff05a21d85621.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 5cb7cfd08ce2..f2e00dc0ae89 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5330,15 +5330,28 @@ sub process {
 		}
 
 # concatenated string without spaces between elements
-		if ($line =~ /$String[A-Z_]/ || $line =~ /[A-Za-z0-9_]$String/) {
-			CHK("CONCATENATED_STRING",
-			    "Concatenated strings should use spaces between elements\n" . $herecurr);
+		if ($line =~ /$String[A-Za-z0-9_]/ || $line =~ /[A-Za-z0-9_]$String/) {
+			if (CHK("CONCATENATED_STRING",
+				"Concatenated strings should use spaces between elements\n" . $herecurr) &&
+			    $fix) {
+				while ($line =~ /($String)/g) {
+					my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]);
+					$fixed[$fixlinenr] =~ s/\Q$extracted_string\E([A-Za-z0-9_])/$extracted_string $1/;
+					$fixed[$fixlinenr] =~ s/([A-Za-z0-9_])\Q$extracted_string\E/$1 $extracted_string/;
+				}
+			}
 		}
 
 # uncoalesced string fragments
 		if ($line =~ /$String\s*"/) {
-			WARN("STRING_FRAGMENTS",
-			     "Consecutive strings are generally better as a single string\n" . $herecurr);
+			if (WARN("STRING_FRAGMENTS",
+				 "Consecutive strings are generally better as a single string\n" . $herecurr) &&
+			    $fix) {
+				while ($line =~ /($String)(?=\s*")/g) {
+					my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]);
+					$fixed[$fixlinenr] =~ s/\Q$extracted_string\E\s*"/substr($extracted_string, 0, -1)/e;
+				}
+			}
 		}
 
 # check for non-standard and hex prefixed decimal printf formats
-- 
cgit v1.2.3


From 5b57980de6b228696dfaf70ad50a8550fbb14fc7 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:33 -0700
Subject: checkpatch: improve runtime execution speed a little

checkpatch repeatedly uses a runtime minimum version check that validates
the minimum perl version required for a regex match by using a "$^V ge
5.10.0" runtime string match.

Only perform that minimum version test once and store the result to reduce
string matching time.

This reduces runtime execution time for patches or files with high line
counts.

An example runtime improvement:

new: $ time ./scripts/checkpatch.pl -f drivers/net/ethernet/intel/i40e/i40e_main.c > /dev/null

real	0m11.856s
user	0m11.831s
sys	0m0.025s

old: $ time ./scripts/checkpatch.pl -f drivers/net/ethernet/intel/i40e/i40e_main.c > /dev/null

real	0m13.330s
user	0m13.282s
sys	0m0.049s

Link: http://lkml.kernel.org/r/db21aa9703833bad65ab70cc4e8a78da5b399138.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 66 +++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index f2e00dc0ae89..0f9788ab988b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -240,11 +240,11 @@ $check_orig = $check;
 
 my $exit = 0;
 
+my $perl_version_ok = 1;
 if ($^V && $^V lt $minimum_perl_version) {
+	$perl_version_ok = 0;
 	printf "$P: requires at least perl version %vd\n", $minimum_perl_version;
-	if (!$ignore_perl_version) {
-		exit(1);
-	}
+	exit(1) if (!$ignore_perl_version);
 }
 
 #if no filenames are given, push '-' to read patch from stdin
@@ -1026,11 +1026,11 @@ if (!$quiet) {
 	hash_show_words(\%use_type, "Used");
 	hash_show_words(\%ignore_type, "Ignored");
 
-	if ($^V lt 5.10.0) {
+	if (!$perl_version_ok) {
 		print << "EOM"
 
 NOTE: perl $^V is not modern enough to detect all possible issues.
-      An upgrade to at least perl v5.10.0 is suggested.
+      An upgrade to at least perl $minimum_perl_version is suggested.
 EOM
 	}
 	if ($exit) {
@@ -3079,7 +3079,7 @@ sub process {
 		}
 
 # check indentation starts on a tab stop
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) {
 			my $indent = length($1);
 			if ($indent % 8) {
@@ -3092,7 +3092,7 @@ sub process {
 		}
 
 # check multi-line statement indentation matches previous line
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|(?:\*\s*)*$Lval\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) {
 			$prevline =~ /^\+(\t*)(.*)$/;
 			my $oldindent = $1;
@@ -3967,7 +3967,7 @@ sub process {
 
 # function brace can't be on same line, except for #defines of do while,
 # or if closed on same line
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $sline =~ /$Type\s*$Ident\s*$balanced_parens\s*\{/ &&
 		    $sline !~ /\#\s*define\b.*do\s*\{/ &&
 		    $sline !~ /}/) {
@@ -4578,7 +4578,7 @@ sub process {
 # check for unnecessary parentheses around comparisons in if uses
 # when !drivers/staging or command-line uses --strict
 		if (($realfile !~ m@^(?:drivers/staging/)@ || $check_orig) &&
-		    $^V && $^V ge 5.10.0 && defined($stat) &&
+		    $perl_version_ok && defined($stat) &&
 		    $stat =~ /(^.\s*if\s*($balanced_parens))/) {
 			my $if_stat = $1;
 			my $test = substr($2, 1, -1);
@@ -4615,7 +4615,7 @@ sub process {
 # return is not a function
 		if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) {
 			my $spacing = $1;
-			if ($^V && $^V ge 5.10.0 &&
+			if ($perl_version_ok &&
 			    $stat =~ /^.\s*return\s*($balanced_parens)\s*;\s*$/) {
 				my $value = $1;
 				$value = deparenthesize($value);
@@ -4642,7 +4642,7 @@ sub process {
                }
 
 # if statements using unnecessary parentheses - ie: if ((foo == bar))
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /\bif\s*((?:\(\s*){2,})/) {
 			my $openparens = $1;
 			my $count = $openparens =~ tr@\(@\(@;
@@ -4659,7 +4659,7 @@ sub process {
 #	avoid cases like "foo + BAR < baz"
 #	only fix matches surrounded by parentheses to avoid incorrect
 #	conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
 			my $lead = $1;
 			my $const = $2;
@@ -5084,7 +5084,7 @@ sub process {
 # do {} while (0) macro tests:
 # single-statement macros do not need to be enclosed in do while (0) loop,
 # macro should not end with a semicolon
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $realfile !~ m@/vmlinux.lds.h$@ &&
 		    $line =~ /^.\s*\#\s*define\s+$Ident(\()?/) {
 			my $ln = $linenr;
@@ -5460,7 +5460,7 @@ sub process {
 		}
 
 # check for mask then right shift without a parentheses
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /$LvalOrFunc\s*\&\s*($LvalOrFunc)\s*>>/ &&
 		    $4 !~ /^\&/) { # $LvalOrFunc may be &foo, ignore if so
 			WARN("MASK_THEN_SHIFT",
@@ -5468,7 +5468,7 @@ sub process {
 		}
 
 # check for pointer comparisons to NULL
-		if ($^V && $^V ge 5.10.0) {
+		if ($perl_version_ok) {
 			while ($line =~ /\b$LvalOrFunc\s*(==|\!=)\s*NULL\b/g) {
 				my $val = $1;
 				my $equal = "!";
@@ -5740,7 +5740,7 @@ sub process {
 		}
 
 # Check for __attribute__ weak, or __weak declarations (may have link issues)
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /(?:$Declare|$DeclareMisordered)\s*$Ident\s*$balanced_parens\s*(?:$Attribute)?\s*;/ &&
 		    ($line =~ /\b__attribute__\s*\(\s*\(.*\bweak\b/ ||
 		     $line =~ /\b__weak\b/)) {
@@ -5822,7 +5822,7 @@ sub process {
 		}
 
 # check for vsprintf extension %p<foo> misuses
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s &&
 		    $1 !~ /^_*volatile_*$/) {
@@ -5869,7 +5869,7 @@ sub process {
 		}
 
 # Check for misused memsets
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/) {
 
@@ -5887,7 +5887,7 @@ sub process {
 		}
 
 # Check for memcpy(foo, bar, ETH_ALEN) that could be ether_addr_copy(foo, bar)
-#		if ($^V && $^V ge 5.10.0 &&
+#		if ($perl_version_ok &&
 #		    defined $stat &&
 #		    $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
 #			if (WARN("PREFER_ETHER_ADDR_COPY",
@@ -5898,7 +5898,7 @@ sub process {
 #		}
 
 # Check for memcmp(foo, bar, ETH_ALEN) that could be ether_addr_equal*(foo, bar)
-#		if ($^V && $^V ge 5.10.0 &&
+#		if ($perl_version_ok &&
 #		    defined $stat &&
 #		    $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
 #			WARN("PREFER_ETHER_ADDR_EQUAL",
@@ -5907,7 +5907,7 @@ sub process {
 
 # check for memset(foo, 0x0, ETH_ALEN) that could be eth_zero_addr
 # check for memset(foo, 0xFF, ETH_ALEN) that could be eth_broadcast_addr
-#		if ($^V && $^V ge 5.10.0 &&
+#		if ($perl_version_ok &&
 #		    defined $stat &&
 #		    $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) {
 #
@@ -5929,7 +5929,7 @@ sub process {
 #		}
 
 # typecasts on min/max could be min_t/max_t
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) {
 			if (defined $2 || defined $7) {
@@ -5953,7 +5953,7 @@ sub process {
 		}
 
 # check usleep_range arguments
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+(?:.*?)\busleep_range\s*\(\s*($FuncArg)\s*,\s*($FuncArg)\s*\)/) {
 			my $min = $1;
@@ -5969,7 +5969,7 @@ sub process {
 		}
 
 # check for naked sscanf
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $line =~ /\bsscanf\b/ &&
 		    ($stat !~ /$Ident\s*=\s*sscanf\s*$balanced_parens/ &&
@@ -5983,7 +5983,7 @@ sub process {
 		}
 
 # check for simple sscanf that should be kstrto<foo>
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $line =~ /\bsscanf\b/) {
 			my $lc = $stat =~ tr@\n@@;
@@ -6055,7 +6055,7 @@ sub process {
 		}
 
 # check for function definitions
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^.\s*(?:$Storage\s+)?$Type\s*($Ident)\s*$balanced_parens\s*{/s) {
 			$context_function = $1;
@@ -6095,14 +6095,14 @@ sub process {
 
 # alloc style
 # p = alloc(sizeof(struct foo), ...) should be p = alloc(sizeof(*p), ...)
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*([kv][mz]alloc(?:_node)?)\s*\(\s*(sizeof\s*\(\s*struct\s+$Lval\s*\))/) {
 			CHK("ALLOC_SIZEOF_STRUCT",
 			    "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr);
 		}
 
 # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+\s*($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) {
 			my $oldfunc = $3;
@@ -6131,7 +6131,7 @@ sub process {
 		}
 
 # check for krealloc arg reuse
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) {
 			WARN("KREALLOC_ARG_REUSE",
 			     "Reusing the krealloc arg is almost always a bug\n" . $herecurr);
@@ -6200,7 +6200,7 @@ sub process {
 		}
 
 # check for switch/default statements without a break;
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) {
 			my $cnt = statement_rawlines($stat);
@@ -6317,7 +6317,7 @@ sub process {
 		}
 
 # likely/unlikely comparisons similar to "(likely(foo) > 0)"
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    $line =~ /\b((?:un)?likely)\s*\(\s*$FuncArg\s*\)\s*$Compare/) {
 			WARN("LIKELY_MISUSE",
 			     "Using $1 should generally have parentheses around the comparison\n" . $herecurr);
@@ -6360,7 +6360,7 @@ sub process {
 # check for DEVICE_ATTR uses that could be DEVICE_ATTR_<FOO>
 # and whether or not function naming is typical and if
 # DEVICE_ATTR permissions uses are unusual too
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $stat =~ /\bDEVICE_ATTR\s*\(\s*(\w+)\s*,\s*\(?\s*(\s*(?:${multi_mode_perms_string_search}|0[0-7]{3,3})\s*)\s*\)?\s*,\s*(\w+)\s*,\s*(\w+)\s*\)/) {
 			my $var = $1;
@@ -6420,7 +6420,7 @@ sub process {
 #   specific definition of not visible in sysfs.
 # o Ignore proc_create*(...) uses with a decimal 0 permission as that means
 #   use the default permissions
-		if ($^V && $^V ge 5.10.0 &&
+		if ($perl_version_ok &&
 		    defined $stat &&
 		    $line =~ /$mode_perms_search/) {
 			foreach my $entry (@mode_permission_funcs) {
-- 
cgit v1.2.3


From 33aa4597dda21348590452656728af2553d2d572 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 21 Aug 2018 21:57:36 -0700
Subject: checkpatch: update section keywords

As of commit bd721ea73e1f ("treewide: replace obsolete _refok by
__ref"), __init_refok no longer exists, so it can be removed.  While at
it, add the modern variants that were still missing.

Link: http://lkml.kernel.org/r/20180706084205.26367-1-geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 0f9788ab988b..b5c875d7132b 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -346,9 +346,10 @@ our $Sparse	= qr{
 			__force|
 			__iomem|
 			__must_check|
-			__init_refok|
 			__kprobes|
 			__ref|
+			__refconst|
+			__refdata|
 			__rcu|
 			__private
 		}x;
-- 
cgit v1.2.3


From cd2614967d8b636911a39dd639aa5b87cf280945 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 21 Aug 2018 21:57:40 -0700
Subject: checkpatch: warn if missing author Signed-off-by

Print a warning if none of the Signed-off-by lines cover the patch author.

Non-ASCII quoted printable encoding in From: headers and (lack of) double
quotes are handled.  Split From: headers are not fully handled: only the
first part is compared.

[geert+renesas@glider.be: only encode UTF-8 quoted printable mail headers]
  Link: http://lkml.kernel.org/r/20180718145254.4770-1-geert+renesas@glider.be
Link: http://lkml.kernel.org/r/20180712100323.26684-1-geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Joe Perches <joe@perches.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index b5c875d7132b..022a77b98123 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -13,6 +13,7 @@ use POSIX;
 use File::Basename;
 use Cwd 'abs_path';
 use Term::ANSIColor qw(:constants);
+use Encode qw(decode encode);
 
 my $P = $0;
 my $D = dirname(abs_path($P));
@@ -2236,6 +2237,8 @@ sub process {
 
 	our $clean = 1;
 	my $signoff = 0;
+	my $author = '';
+	my $authorsignoff = 0;
 	my $is_patch = 0;
 	my $in_header_lines = $file ? 0 : 1;
 	my $in_commit_log = 0;		#Scanning lines before patch
@@ -2518,10 +2521,24 @@ sub process {
 			}
 		}
 
+# Check the patch for a From:
+		if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) {
+			$author = $1;
+			$author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i);
+			$author =~ s/"//g;
+		}
+
 # Check the patch for a signoff:
 		if ($line =~ /^\s*signed-off-by:/i) {
 			$signoff++;
 			$in_commit_log = 0;
+			if ($author ne '') {
+				my $l = $line;
+				$l =~ s/"//g;
+				if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) {
+				    $authorsignoff = 1;
+				}
+			}
 		}
 
 # Check if MAINTAINERS is being updated.  If so, there's probably no need to
@@ -6507,9 +6524,14 @@ sub process {
 		ERROR("NOT_UNIFIED_DIFF",
 		      "Does not appear to be a unified-diff format patch\n");
 	}
-	if ($is_patch && $has_commit_log && $chk_signoff && $signoff == 0) {
-		ERROR("MISSING_SIGN_OFF",
-		      "Missing Signed-off-by: line(s)\n");
+	if ($is_patch && $has_commit_log && $chk_signoff) {
+		if ($signoff == 0) {
+			ERROR("MISSING_SIGN_OFF",
+			      "Missing Signed-off-by: line(s)\n");
+		} elsif (!$authorsignoff) {
+			WARN("NO_AUTHOR_SIGN_OFF",
+			     "Missing Signed-off-by: line by nominal patch author '$author'\n");
+		}
 	}
 
 	print report_dump();
-- 
cgit v1.2.3


From 8c8c45cfdd5d8dd555da0d13c1d53b48f99ffe6f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:43 -0700
Subject: checkpatch: fix macro argument reuse test

Multiple line macro definitions where the arguments are separated by line
continuations can cause checkpatch to emit invalid syntax regex tests.

This can occur when a single argument is modified in a part of a patch.

For example: (to not add a diff in the commit message)

$ ./scripts/checkpatch.pl --git db023296f0115d2fe01fdabad54678f2b806da23
Unterminated \g... pattern in regex; <very long regex omitted>

And, the test does not work correctly when these arguments are all new as
the initial patch line addition "+" is used in the argument name.

Fix this by stripping the line continuations and any "+" from the list of
arguments.

Link: http://lkml.kernel.org/r/86cdb43a4db70670c102020093f7fb4eb3003e01.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 022a77b98123..34e4683de7a3 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4967,6 +4967,7 @@ sub process {
 			if (defined $define_args && $define_args ne "") {
 				$define_args = substr($define_args, 1, length($define_args) - 2);
 				$define_args =~ s/\s*//g;
+				$define_args =~ s/\\\+?//g;
 				@def_args = split(",", $define_args);
 			}
 
-- 
cgit v1.2.3


From 3b6e8ac9e740b71b07df4a1e89ba5841686d8e9f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:47 -0700
Subject: checkpatch: validate SPDX license with spdxcheck.py

Use the existing scripts/spdxcheck.py to validate any
SPDX-License-Identifier found in line 1 or 2 of patches or files.

Miscellanea:

o Properly indent the existing SPDX-License-Identifier block.

Link: http://lkml.kernel.org/r/05b832407b24e0a27e419906187cd863bc1617c7.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Rob Herring <robh@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 34e4683de7a3..4523dd6d0476 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -849,6 +849,16 @@ sub is_maintained_obsolete {
 	return $status =~ /obsolete/i;
 }
 
+sub is_SPDX_License_valid {
+	my ($license) = @_;
+
+	return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py"));
+
+	my $status = `echo "$license" | python $root/scripts/spdxcheck.py -`;
+	return 0 if ($status ne "");
+	return 1;
+}
+
 my $camelcase_seeded = 0;
 sub seed_camelcase_includes {
 	return if ($camelcase_seeded);
@@ -2978,8 +2988,14 @@ sub process {
 
 				if ($comment !~ /^$/ &&
 				    $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
-					WARN("SPDX_LICENSE_TAG",
-					     "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+					 WARN("SPDX_LICENSE_TAG",
+					      "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+				} elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
+					 my $spdx_license = $1;
+					 if (!is_SPDX_License_valid($spdx_license)) {
+						  WARN("SPDX_LICENSE_TAG",
+						       "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+					 }
 				}
 			}
 		}
-- 
cgit v1.2.3


From 4cab63cea374c684eb0db352b40a09e3f7c45057 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:57:50 -0700
Subject: checkpatch: fix krealloc reuse test

The current krealloc test does not function correctly when the temporary
pointer return name contains the original pointer name.

Fix that by maximally matching the return pointer name and the original
pointer name and doing a separate comparison of the both names.

Link: http://lkml.kernel.org/r/e617ecb8c019a9c4c56540a1bec16c8aed43a4e4.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Reported-by: Lars-Peter Clausen <lars@metafoo.de>
Cc: Manish Narani <manish.narani@xilinx.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4523dd6d0476..d43a446941ce 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6167,7 +6167,8 @@ sub process {
 
 # check for krealloc arg reuse
 		if ($perl_version_ok &&
-		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) {
+		    $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*($Lval)\s*,/ &&
+		    $1 eq $3) {
 			WARN("KREALLOC_ARG_REUSE",
 			     "Reusing the krealloc arg is almost always a bug\n" . $herecurr);
 		}
-- 
cgit v1.2.3


From 60f890105547f7a48ea1d67c5ae69966c7245b23 Mon Sep 17 00:00:00 2001
From: Prakruthi Deepak Heragu <pheragu@codeaurora.org>
Date: Tue, 21 Aug 2018 21:57:57 -0700
Subject: checkpatch: check for #if 0/#if 1

The #if 0 or #if 1 is used to toggle features. Warn if #if 0 or #if 1
is present and suggest that they can be removed.

[akpm@linux-foundation.org: fix spacing around periods, per Joe\
Link: http://lkml.kernel.org/r/1532625218-24321-1-git-send-email-pheragu@codeaurora.org
Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Signed-off-by: Prakruthi Deepak Heragu <pheragu@codeaurora.org>
Acked-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index d43a446941ce..b5eade941f4a 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5422,9 +5422,14 @@ sub process {
 
 # warn about #if 0
 		if ($line =~ /^.\s*\#\s*if\s+0\b/) {
-			CHK("REDUNDANT_CODE",
-			    "if this code is redundant consider removing it\n" .
-				$herecurr);
+			WARN("IF_0",
+			     "Consider removing the code enclosed by this #if 0 and its #endif\n" . $herecurr);
+		}
+
+# warn about #if 1
+		if ($line =~ /^.\s*\#\s*if\s+1\b/) {
+			WARN("IF_1",
+			     "Consider removing the #if 1 and its #endif\n" . $herecurr);
 		}
 
 # check for needless "if (<foo>) fn(<foo>)" uses
-- 
cgit v1.2.3


From 490b292c834c5e72057f4315ef915692cbd6defd Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:58:01 -0700
Subject: checkpatch: warn when a patch doesn't have a description

Potential patches should have a commit description.  Emit a warning when
there isn't one.

[akpm@linux-foundation.org: s/else if/elsif/]
Link: http://lkml.kernel.org/r/1b099f4d8373aa583a17011992676bf0f3f09eee.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Suggested-by: Prakruthi Deepak Heragu <pheragu@codeaurora.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index b5eade941f4a..25adf9811a2c 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2253,6 +2253,7 @@ sub process {
 	my $in_header_lines = $file ? 0 : 1;
 	my $in_commit_log = 0;		#Scanning lines before patch
 	my $has_commit_log = 0;		#Encountered lines before patch
+	my $commit_log_lines = 0;	#Number of commit log lines
 	my $commit_log_possible_stack_dump = 0;
 	my $commit_log_long_line = 0;
 	my $commit_log_has_diff = 0;
@@ -2510,6 +2511,18 @@ sub process {
 
 		$cnt_lines++ if ($realcnt != 0);
 
+# Verify the existence of a commit log if appropriate
+# 2 is used because a $signature is counted in $commit_log_lines
+		if ($in_commit_log) {
+			if ($line !~ /^\s*$/) {
+				$commit_log_lines++;	#could be a $signature
+			}
+		} elsif ($has_commit_log && $commit_log_lines < 2) {
+			WARN("COMMIT_MESSAGE",
+			     "Missing commit description - Add an appropriate one\n");
+			$commit_log_lines = 2;	#warn only once
+		}
+
 # Check if the commit log has what seems like a diff which can confuse patch
 		if ($in_commit_log && !$commit_log_has_diff &&
 		    (($line =~ m@^\s+diff\b.*a/[\w/]+@ &&
-- 
cgit v1.2.3


From 56294112791ac11c6fcdc6544f1381f9272202db Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:58:04 -0700
Subject: checkpatch: fix SPDX license check with --root=<path>

checkpatch uses the in-kernel script spdxcheck.py to validate the specific
license in a file or script.

This check can currently fail for a couple reasons:

o spdxcheck.py assumes the existence of git tree that may not
  exist for a bare source tree from something like a tarball
o the spdxcheck.py must be run from the top level root directory

So add a git existence test and set the subprocess subdirectory.

Link: http://lkml.kernel.org/r/2b32864324ae9c92948b002ec4c0c22409ed98f1.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Reported-by: Charlemagne Lasse <charlemagnelasse@gmail.com>
Tested-by: Charlemagne Lasse <charlemagnelasse@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 25adf9811a2c..eced9b303995 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -852,9 +852,10 @@ sub is_maintained_obsolete {
 sub is_SPDX_License_valid {
 	my ($license) = @_;
 
-	return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py"));
+	return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$root/.git"));
 
-	my $status = `echo "$license" | python $root/scripts/spdxcheck.py -`;
+	my $root_path = abs_path($root);
+	my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`;
 	return 0 if ($status ne "");
 	return 1;
 }
-- 
cgit v1.2.3


From 6ad724e2a48fc24dd9788490d85a3490cb0117c1 Mon Sep 17 00:00:00 2001
From: Michal Zylowski <michal.zylowski@intel.com>
Date: Tue, 21 Aug 2018 21:58:08 -0700
Subject: checkpatch: check for space after "else" keyword

Current checkpatch implementation permits notation like

	} else{

in kernel code.  It looks like oversight and inconsistency in checkpatch
rules (e.g.  instruction like 'do' is tested).

Add regex for checking space after 'else' keyword and trigger error if
space is not present.

Link: http://lkml.kernel.org/r/1533545753-8870-1-git-send-email-michal.zylowski@intel.com
Signed-off-by: Michal Zylowski <michal.zylowski@intel.com>
Acked-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index eced9b303995..f43d95a25712 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4531,11 +4531,11 @@ sub process {
 
 #need space before brace following if, while, etc
 		if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) ||
-		    $line =~ /do\{/) {
+		    $line =~ /\b(?:else|do)\{/) {
 			if (ERROR("SPACING",
 				  "space required before the open brace '{'\n" . $herecurr) &&
 			    $fix) {
-				$fixed[$fixlinenr] =~ s/^(\+.*(?:do|\)))\{/$1 {/;
+				$fixed[$fixlinenr] =~ s/^(\+.*(?:do|else|\)))\{/$1 {/;
 			}
 		}
 
-- 
cgit v1.2.3


From 809e082e973da2c7b751dab9348853cb8cc3f25c Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 21 Aug 2018 21:58:12 -0700
Subject: checkpatch: warn on unnecessary int declarations

On Sun, 2018-08-05 at 08:52 -0700, Linus Torvalds wrote:
> "long unsigned int" isn't _technically_ wrong. But we normally
> call that type "unsigned long".

So add a checkpatch test for it.

Link: http://lkml.kernel.org/r/7bbd97dc0a1e5896a0251fada7bb68bb33643f77.camel@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index f43d95a25712..5971b0c11616 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3829,6 +3829,26 @@ sub process {
 			     "type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr);
 		}
 
+# check for unnecessary <signed> int declarations of short/long/long long
+		while ($sline =~ m{\b($TypeMisordered(\s*\*)*|$C90_int_types)\b}g) {
+			my $type = trim($1);
+			next if ($type !~ /\bint\b/);
+			next if ($type !~ /\b(?:short|long\s+long|long)\b/);
+			my $new_type = $type;
+			$new_type =~ s/\b\s*int\s*\b/ /;
+			$new_type =~ s/\b\s*(?:un)?signed\b\s*/ /;
+			$new_type =~ s/^const\s+//;
+			$new_type = "unsigned $new_type" if ($type =~ /\bunsigned\b/);
+			$new_type = "const $new_type" if ($type =~ /^const\b/);
+			$new_type =~ s/\s+/ /g;
+			$new_type = trim($new_type);
+			if (WARN("UNNECESSARY_INT",
+				 "Prefer '$new_type' over '$type' as the int is unnecessary\n" . $herecurr) &&
+			    $fix) {
+				$fixed[$fixlinenr] =~ s/\b\Q$type\E\b/$new_type/;
+			}
+		}
+
 # check for static const char * arrays.
 		if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) {
 			WARN("STATIC_CONST_CHAR_ARRAY",
-- 
cgit v1.2.3


From 133712a2ec8493596565e9de3d25659d6e66a47e Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 21 Aug 2018 21:58:16 -0700
Subject: checkpatch: DT bindings should be a separate patch

Devicetree bindings should be their own patch as documented in
Documentation/devicetree/bindings/submitting-patches.txt section I.1.
This is because bindings are logically independent from a driver
implementation, they have a different maintainer (even though they often
are applied via the same tree), and it makes for a cleaner history in the
DT only tree created with git-filter-branch.

[robh@kernel.org: add doc pointer to warning, simplify logic]
  Link: http://lkml.kernel.org/r/20180810170513.26284-1-robh@kernel.org
[robh@kernel.org: v3]
  Link: http://lkml.kernel.org/r/20180810225049.20452-1-robh@kernel.org
Link: http://lkml.kernel.org/r/20180809205032.22205-1-robh@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/checkpatch.pl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 5971b0c11616..5219280bf7ff 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2251,6 +2251,7 @@ sub process {
 	my $author = '';
 	my $authorsignoff = 0;
 	my $is_patch = 0;
+	my $is_binding_patch = -1;
 	my $in_header_lines = $file ? 0 : 1;
 	my $in_commit_log = 0;		#Scanning lines before patch
 	my $has_commit_log = 0;		#Encountered lines before patch
@@ -2501,6 +2502,19 @@ sub process {
 				$check = $check_orig;
 			}
 			$checklicenseline = 1;
+
+			if ($realfile !~ /^MAINTAINERS/) {
+				my $last_binding_patch = $is_binding_patch;
+
+				$is_binding_patch = () = $realfile =~ m@^(?:Documentation/devicetree/|include/dt-bindings/)@;
+
+				if (($last_binding_patch != -1) &&
+				    ($last_binding_patch ^ $is_binding_patch)) {
+					WARN("DT_SPLIT_BINDING_PATCH",
+					     "DT binding docs and includes should be a separate patch. See: Documentation/devicetree/bindings/submitting-patches.txt\n");
+				}
+			}
+
 			next;
 		}
 
-- 
cgit v1.2.3


From 514056d506e44084369f5ce1c8186e4253901a05 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:58:19 -0700
Subject: fs/eventpoll.c: simply CONFIG_NET_RX_BUSY_POLL ifdefery

... 'tis easier on the eye.

[akpm@linux-foundation.org: use inlines rather than macros]
Link: http://lkml.kernel.org/r/20180725185620.11020-1-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 58b96d8a898a..b5e43e11f1e3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -391,7 +391,6 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
 
 	return ep_events_available(ep) || busy_loop_timeout(start_time);
 }
-#endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
  * Busy poll if globally on and supporting sockets found && no events,
@@ -401,20 +400,16 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  */
 static void ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 
 	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
 		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
-#endif
 }
 
 static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	if (ep->napi_id)
 		ep->napi_id = 0;
-#endif
 }
 
 /*
@@ -422,7 +417,6 @@ static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
  */
 static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	struct eventpoll *ep;
 	unsigned int napi_id;
 	struct socket *sock;
@@ -452,9 +446,24 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 
 	/* record NAPI ID for use in next busy poll */
 	ep->napi_id = napi_id;
-#endif
 }
 
+#else
+
+static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+}
+
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
-- 
cgit v1.2.3


From 679abf381a18e945457b01921f667cee9e656a7f Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:58:23 -0700
Subject: fs/eventpoll.c: loosen irq safety in ep_poll()

Similar to other calls, ep_poll() is not called with interrupts disabled,
and we can therefore avoid the irq save/restore dance and just disable
local irqs.  In fact, the call should never be called in irq context at
all, considering that the only path is

epoll_wait(2) -> do_epoll_wait() -> ep_poll().

When running on a 2 socket 40-core (ht) IvyBridge a common pipe based
epoll_wait(2) microbenchmark, the following performance improvements are
seen:

    # threads       vanilla         dirty
	 1          1805587	    2106412
	 2          1854064	    2090762
	 4          1805484	    2017436
	 8          1751222	    1974475
	 16         1725299	    1962104
	 32         1378463	    1571233
	 64          787368	     900784

Which is a pretty constantly near 15%.

Also add a lockdep check such that we detect any mischief before
deadlocking.

Link: http://lkml.kernel.org/r/20180727053432.16679-2-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@akamai.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b5e43e11f1e3..88473e6271ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1746,11 +1746,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
 	int res = 0, eavail, timed_out = 0;
-	unsigned long flags;
 	u64 slack = 0;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
+	lockdep_assert_irqs_enabled();
+
 	if (timeout > 0) {
 		struct timespec64 end_time = ep_set_mstimeout(timeout);
 
@@ -1763,7 +1764,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * caller specified a non blocking operation.
 		 */
 		timed_out = 1;
-		spin_lock_irqsave(&ep->wq.lock, flags);
+		spin_lock_irq(&ep->wq.lock);
 		goto check_events;
 	}
 
@@ -1772,7 +1773,7 @@ fetch_events:
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
-	spin_lock_irqsave(&ep->wq.lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 
 	if (!ep_events_available(ep)) {
 		/*
@@ -1814,11 +1815,11 @@ fetch_events:
 				break;
 			}
 
-			spin_unlock_irqrestore(&ep->wq.lock, flags);
+			spin_unlock_irq(&ep->wq.lock);
 			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 				timed_out = 1;
 
-			spin_lock_irqsave(&ep->wq.lock, flags);
+			spin_lock_irq(&ep->wq.lock);
 		}
 
 		__remove_wait_queue(&ep->wq, &wait);
@@ -1828,7 +1829,7 @@ check_events:
 	/* Is it worth to try to dig for events ? */
 	eavail = ep_events_available(ep);
 
-	spin_unlock_irqrestore(&ep->wq.lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
-- 
cgit v1.2.3


From 992991c03ca033f779aae1afb5b0c27bcb7139c3 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 21:58:26 -0700
Subject: fs/eventpoll.c: simplify ep_is_linked() callers

Instead of having each caller pass the rdllink explicitly, just have
ep_is_linked() pass it while the callers just need the epi pointer.  This
helper is all about the rdllink, and this change, furthermore, improves
the function's self documentation.

Link: http://lkml.kernel.org/r/20180727053432.16679-3-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 88473e6271ef..42bbe6824b4b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -336,9 +336,9 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 }
 
 /* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct list_head *p)
+static inline int ep_is_linked(struct epitem *epi)
 {
-	return !list_empty(p);
+	return !list_empty(&epi->rdllink);
 }
 
 static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
@@ -721,7 +721,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 		 * queued into ->ovflist but the "txlist" might already
 		 * contain them, and the list_splice() below takes care of them.
 		 */
-		if (!ep_is_linked(&epi->rdllink)) {
+		if (!ep_is_linked(epi)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 		}
@@ -790,7 +790,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
 	spin_lock_irq(&ep->wq.lock);
-	if (ep_is_linked(&epi->rdllink))
+	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
 	spin_unlock_irq(&ep->wq.lock);
 
@@ -1171,7 +1171,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (!ep_is_linked(&epi->rdllink)) {
+	if (!ep_is_linked(epi)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake_rcu(epi);
 	}
@@ -1495,7 +1495,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	ep_set_busy_poll_napi_id(epi);
 
 	/* If the file is already "ready" we drop it inside the ready list */
-	if (revents && !ep_is_linked(&epi->rdllink)) {
+	if (revents && !ep_is_linked(epi)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake(epi);
 
@@ -1533,7 +1533,7 @@ error_unregister:
 	 * And ep_insert() is called with "mtx" held.
 	 */
 	spin_lock_irq(&ep->wq.lock);
-	if (ep_is_linked(&epi->rdllink))
+	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
 	spin_unlock_irq(&ep->wq.lock);
 
@@ -1601,7 +1601,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	 */
 	if (ep_item_poll(epi, &pt, 1)) {
 		spin_lock_irq(&ep->wq.lock);
-		if (!ep_is_linked(&epi->rdllink)) {
+		if (!ep_is_linked(epi)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 
-- 
cgit v1.2.3


From 6ad018e3ca9efd80a724e32c02ae7552fd0829f2 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Tue, 21 Aug 2018 21:58:30 -0700
Subject: init/: remove ineffective sparse disabling

Sparse checking used to be disabled on init/do_mounts.c and a few related
files because "Many of the syscalls used in this file expect some of the
arguments to be __user pointers not __kernel pointers".

However since 28128c61e ("kconfig.h: Include compiler types to avoid
missed struct attributes") the checks are, in fact, not disabled anymore
because of the more early include of "linux/compiler_types.h"

So remove the now ineffective #undefery that was done to disable these
warnings, as well as the associated comment.

Link: http://lkml.kernel.org/r/20180617115355.53799-1-luc.vanoostenryck@gmail.com
Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/do_mounts.c        | 10 ----------
 init/do_mounts_initrd.c | 10 ----------
 init/do_mounts_md.c     | 10 ----------
 init/do_mounts_rd.c     | 10 ----------
 init/initramfs.c        | 10 ----------
 5 files changed, 50 deletions(-)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 2c71dabe5626..e1c9afa9d8c9 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -1,13 +1,3 @@
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers.  To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/ctype.h>
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 5a91aefa7305..d1a5d885ce13 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -1,14 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers.  To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
 #include <linux/unistd.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 7d85d172bc7e..b84031528dd4 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -1,14 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers.  To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
 #include <linux/delay.h>
 #include <linux/raid/md_u.h>
 #include <linux/raid/md_p.h>
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index 035a5f0ab26b..32fb049d18f9 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -1,14 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers.  To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/minix_fs.h>
diff --git a/init/initramfs.c b/init/initramfs.c
index 13643c46ebab..640557788026 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -1,14 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-/*
- * Many of the syscalls used in this file expect some of the arguments
- * to be __user pointers not __kernel pointers.  To limit the sparse
- * noise, turn off sparse checking for this file.
- */
-#ifdef __CHECKER__
-#undef __CHECKER__
-#warning "Sparse checking disabled for this file"
-#endif
-
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 3903bf940b1faec568a1bff0603f8470bdfc21c1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 21 Aug 2018 21:58:34 -0700
Subject: init/Kconfig: fix its typos

Correct typos of "it's" to "its.

Link: http://lkml.kernel.org/r/0ac627b6-5527-55f4-0489-1631aa34fc11@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 9bd50ba8253f..5eb1ef7a275b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -436,7 +436,7 @@ config BSD_PROCESS_ACCT_V3
 	help
 	  If you say Y here, the process accounting information is written
 	  in a new file format that also logs the process IDs of each
-	  process and it's parent. Note that this file format is incompatible
+	  process and its parent. Note that this file format is incompatible
 	  with previous v0/v1/v2 file formats, so you will need updated tools
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.gnu.org/software/acct/>.
@@ -1702,7 +1702,7 @@ config MMAP_ALLOW_UNINITIALIZED
 	default n
 	help
 	  Normally, and according to the Linux spec, anonymous memory obtained
-	  from mmap() has it's contents cleared before it is passed to
+	  from mmap() has its contents cleared before it is passed to
 	  userspace.  Enabling this config option allows you to request that
 	  mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
 	  providing a huge performance boost.  If this option is not enabled,
-- 
cgit v1.2.3


From 3f5c15d8a7d86fb642fe07df58c2065190f3e531 Mon Sep 17 00:00:00 2001
From: Paul Menzel <pmenzel@molgen.mpg.de>
Date: Tue, 21 Aug 2018 21:58:37 -0700
Subject: init/main.c: log init process file name

Add a log message to `run_init_process()`.

This log message serves two purposes.

1.  If the init process is not specified on the Linux Kernel command
    line, the user sees, what file was chosen.

2.  The time stamps shows exactly, when the Linux kernel handed over
    control to the init process.

Link: http://lkml.kernel.org/r/b1fc97fa-4aa9-1904-ddb5-859e78995c41@molgen.mpg.de
Signed-off-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/init/main.c b/init/main.c
index 3a6ce89e128f..18f8f0140fa0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1002,6 +1002,7 @@ void __init load_default_modules(void)
 static int run_init_process(const char *init_filename)
 {
 	argv_init[0] = init_filename;
+	pr_info("Run %s as init process\n", init_filename);
 	return do_execve(getname_kernel(init_filename),
 		(const char __user *const __user *)argv_init,
 		(const char __user *const __user *)envp_init);
-- 
cgit v1.2.3


From d4d79b8195bfc6d5d8f82f9189c1bc828cc7e03a Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:41 -0700
Subject: autofs: fix directory and symlink access

Depending on how it is configured the autofs user space daemon can leave
in use mounts mounted at exit and re-connect to them at start up.  But for
this to work best the state of the autofs file system needs to be left
intact over the restart.

Also, at system shutdown, mounts in an autofs file system might be
umounted exposing a mount point trigger for which subsequent access can
lead to a hang.  So recent versions of automount(8) now does its best to
set autofs file system mounts catatonic at shutdown.

When autofs file system mounts are catatonic it's currently possible to
create and remove directories and symlinks which can be a problem at
restart, as described above.

So return EACCES in the directory, symlink and unlink methods if the
autofs file system is catatonic.

Link: http://lkml.kernel.org/r/152902119090.4144.9561910674530214291.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/root.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index a3d414150578..782e57b911ab 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -559,6 +559,13 @@ static int autofs_dir_symlink(struct inode *dir,
 	if (!autofs_oz_mode(sbi))
 		return -EACCES;
 
+	/* autofs_oz_mode() needs to allow path walks when the
+	 * autofs mount is catatonic but the state of an autofs
+	 * file system needs to be preserved over restarts.
+	 */
+	if (sbi->catatonic)
+		return -EACCES;
+
 	BUG_ON(!ino);
 
 	autofs_clean_ino(ino);
@@ -612,9 +619,15 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	struct autofs_info *p_ino;
 
-	/* This allows root to remove symlinks */
-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (!autofs_oz_mode(sbi))
+		return -EACCES;
+
+	/* autofs_oz_mode() needs to allow path walks when the
+	 * autofs mount is catatonic but the state of an autofs
+	 * file system needs to be preserved over restarts.
+	 */
+	if (sbi->catatonic)
+		return -EACCES;
 
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs_dentry_ino(dentry->d_parent);
@@ -697,6 +710,13 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs_oz_mode(sbi))
 		return -EACCES;
 
+	/* autofs_oz_mode() needs to allow path walks when the
+	 * autofs mount is catatonic but the state of an autofs
+	 * file system needs to be preserved over restarts.
+	 */
+	if (sbi->catatonic)
+		return -EACCES;
+
 	spin_lock(&sbi->lookup_lock);
 	if (!simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
@@ -735,6 +755,13 @@ static int autofs_dir_mkdir(struct inode *dir,
 	if (!autofs_oz_mode(sbi))
 		return -EACCES;
 
+	/* autofs_oz_mode() needs to allow path walks when the
+	 * autofs mount is catatonic but the state of an autofs
+	 * file system needs to be preserved over restarts.
+	 */
+	if (sbi->catatonic)
+		return -EACCES;
+
 	pr_debug("dentry %p, creating %pd\n", dentry, dentry);
 
 	BUG_ON(!ino);
-- 
cgit v1.2.3


From 2fd9944f0fd41f7bc2f590169a9a758e1186b345 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:44 -0700
Subject: autofs: fix inconsistent use of now variable

The global variable "now" in fs/autofs/expire.c is used in an inconsistent
way, sometimes using jiffies directly, and sometimes using the "now"
variable, and setting it isn't done consistently either.

But the autofs dentry info last_used field is only updated during path
walks or during expire so jiffies can be used directly and the global
variable "now" removed.

Link: http://lkml.kernel.org/r/152937731702.21213.7371321165189170865.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/expire.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b332d3f6e730..295feec10ea6 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -10,8 +10,6 @@
 
 #include "autofs_i.h"
 
-static unsigned long now;
-
 /* Check if a dentry can be expired */
 static inline int autofs_can_expire(struct dentry *dentry,
 				    unsigned long timeout, int do_now)
@@ -24,7 +22,7 @@ static inline int autofs_can_expire(struct dentry *dentry,
 
 	if (!do_now) {
 		/* Too young to die */
-		if (!timeout || time_after(ino->last_used + timeout, now))
+		if (!timeout || time_after(ino->last_used + timeout, jiffies))
 			return 0;
 	}
 	return 1;
@@ -307,7 +305,6 @@ struct dentry *autofs_expire_direct(struct super_block *sb,
 	if (!root)
 		return NULL;
 
-	now = jiffies;
 	timeout = sbi->exp_timeout;
 
 	if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
@@ -442,7 +439,6 @@ struct dentry *autofs_expire_indirect(struct super_block *sb,
 	if (!root)
 		return NULL;
 
-	now = jiffies;
 	timeout = sbi->exp_timeout;
 
 	dentry = NULL;
@@ -575,7 +571,7 @@ int autofs_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs_dentry_ino(dentry);
 	/* avoid rapid-fire expire attempts if expiry fails */
-	ino->last_used = now;
+	ino->last_used = jiffies;
 	ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
@@ -605,7 +601,7 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 		spin_lock(&sbi->fs_lock);
 		/* avoid rapid-fire expire attempts if expiry fails */
-		ino->last_used = now;
+		ino->last_used = jiffies;
 		ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
-- 
cgit v1.2.3


From d1055565bdc27fd4dc90a91988b938b949662025 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:48 -0700
Subject: autofs: fix clearing AUTOFS_EXP_LEAVES in autofs_expire_indirect()

The expire flag AUTOFS_EXP_LEAVES is cleared before the second call to
should_expire() in autofs_expire_indirect() but the parameter passed in
the second call is incorrect.

Fortunately AUTOFS_EXP_LEAVES expire flag has not been used for a long
time but might be needed in the future so fix it rather than remove the
expire leaves functionality.

Link: http://lkml.kernel.org/r/152937732410.21213.7447294898147765076.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/expire.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 295feec10ea6..41855cdc5630 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -467,7 +467,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb,
 		 * things have changed.
 		 */
 		flags &= ~AUTOFS_EXP_LEAVES;
-		found = should_expire(expired, mnt, timeout, how);
+		found = should_expire(expired, mnt, timeout, flags);
 		if (!found || found != expired)
 			/* Something has changed, continue */
 			goto next;
-- 
cgit v1.2.3


From 5d30517d67e349164838ad039f0f2d09dde15f59 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:51 -0700
Subject: autofs: make autofs_expire_direct() static

autofs_expire_direct() isn't used outside of fs/autofs/expire.c so make it
static.

Link: http://lkml.kernel.org/r/152937732944.21213.11821977712410930973.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h | 3 ---
 fs/autofs/expire.c   | 8 ++++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 5057b9f0f846..fe1b62cfc769 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -156,9 +156,6 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			   struct autofs_sb_info *sbi, int when);
 int autofs_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
-struct dentry *autofs_expire_direct(struct super_block *sb,
-				    struct vfsmount *mnt,
-				    struct autofs_sb_info *sbi, int how);
 struct dentry *autofs_expire_indirect(struct super_block *sb,
 				      struct vfsmount *mnt,
 				      struct autofs_sb_info *sbi, int how);
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 41855cdc5630..64e6eba2c628 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -292,10 +292,10 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 }
 
 /* Check if we can expire a direct mount (possibly a tree) */
-struct dentry *autofs_expire_direct(struct super_block *sb,
-				    struct vfsmount *mnt,
-				    struct autofs_sb_info *sbi,
-				    int how)
+static struct dentry *autofs_expire_direct(struct super_block *sb,
+					   struct vfsmount *mnt,
+					   struct autofs_sb_info *sbi,
+					   int how)
 {
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
-- 
cgit v1.2.3


From 571bc35c42f3455bc55393f22cb97f7407a5a6d1 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:54 -0700
Subject: autofs: make autofs_expire_indirect() static

autofs_expire_indirect() isn't used outside of fs/autofs/expire.c so make
it static.

Link: http://lkml.kernel.org/r/152937733512.21213.10509996499623738446.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h | 3 ---
 fs/autofs/expire.c   | 8 ++++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index fe1b62cfc769..f53ca4bd5365 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -156,9 +156,6 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			   struct autofs_sb_info *sbi, int when);
 int autofs_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
-struct dentry *autofs_expire_indirect(struct super_block *sb,
-				      struct vfsmount *mnt,
-				      struct autofs_sb_info *sbi, int how);
 
 /* Device node initialization */
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 64e6eba2c628..5fb47a4ca48c 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -424,10 +424,10 @@ static struct dentry *should_expire(struct dentry *dentry,
  *  - it is unused by any user process
  *  - it has been unused for exp_timeout time
  */
-struct dentry *autofs_expire_indirect(struct super_block *sb,
-				      struct vfsmount *mnt,
-				      struct autofs_sb_info *sbi,
-				      int how)
+static struct dentry *autofs_expire_indirect(struct super_block *sb,
+					     struct vfsmount *mnt,
+					     struct autofs_sb_info *sbi,
+					     int how)
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
-- 
cgit v1.2.3


From e5c85e1fe19c03777cbacf4b5a5167b2f5ff90fb Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:58:58 -0700
Subject: autofs: make expire flags usage consistent with v5 params

Make the usage of the expire flags consistent by naming the expire flags
the same as it is named in the version 5 miscelaneous ioctl parameters and
only check the bit flags when needed.

Link: http://lkml.kernel.org/r/152937734046.21213.9454131988766280028.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/autofs_i.h |  2 +-
 fs/autofs/expire.c   | 61 ++++++++++++++++++++++++----------------------------
 2 files changed, 29 insertions(+), 34 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index f53ca4bd5365..633986a6a93a 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -153,7 +153,7 @@ int autofs_expire_run(struct super_block *, struct vfsmount *,
 		      struct autofs_sb_info *,
 		      struct autofs_packet_expire __user *);
 int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			   struct autofs_sb_info *sbi, int when);
+			   struct autofs_sb_info *sbi, unsigned int how);
 int autofs_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 5fb47a4ca48c..dfb666c5b8a2 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -12,7 +12,7 @@
 
 /* Check if a dentry can be expired */
 static inline int autofs_can_expire(struct dentry *dentry,
-				    unsigned long timeout, int do_now)
+				    unsigned long timeout, unsigned int how)
 {
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 
@@ -20,7 +20,7 @@ static inline int autofs_can_expire(struct dentry *dentry,
 	if (ino == NULL)
 		return 0;
 
-	if (!do_now) {
+	if (!(how & AUTOFS_EXP_IMMEDIATE)) {
 		/* Too young to die */
 		if (!timeout || time_after(ino->last_used + timeout, jiffies))
 			return 0;
@@ -185,7 +185,7 @@ again:
 static int autofs_direct_busy(struct vfsmount *mnt,
 			      struct dentry *top,
 			      unsigned long timeout,
-			      int do_now)
+			      unsigned int how)
 {
 	pr_debug("top %p %pd\n", top, top);
 
@@ -200,7 +200,7 @@ static int autofs_direct_busy(struct vfsmount *mnt,
 	}
 
 	/* Timeout of a direct mount is determined by its top dentry */
-	if (!autofs_can_expire(top, timeout, do_now))
+	if (!autofs_can_expire(top, timeout, how))
 		return 1;
 
 	return 0;
@@ -213,7 +213,7 @@ static int autofs_direct_busy(struct vfsmount *mnt,
 static int autofs_tree_busy(struct vfsmount *mnt,
 			    struct dentry *top,
 			    unsigned long timeout,
-			    int do_now)
+			    unsigned int how)
 {
 	struct autofs_info *top_ino = autofs_dentry_ino(top);
 	struct dentry *p;
@@ -259,7 +259,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 	}
 
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
-	if (!autofs_can_expire(top, timeout, do_now))
+	if (!autofs_can_expire(top, timeout, how))
 		return 1;
 
 	return 0;
@@ -268,7 +268,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 					  struct dentry *parent,
 					  unsigned long timeout,
-					  int do_now)
+					  unsigned int how)
 {
 	struct dentry *p;
 
@@ -284,7 +284,7 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 				continue;
 
 			/* Can we expire this guy */
-			if (autofs_can_expire(p, timeout, do_now))
+			if (autofs_can_expire(p, timeout, how))
 				return p;
 		}
 	}
@@ -295,19 +295,18 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 static struct dentry *autofs_expire_direct(struct super_block *sb,
 					   struct vfsmount *mnt,
 					   struct autofs_sb_info *sbi,
-					   int how)
+					   unsigned int how)
 {
-	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
-	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	struct autofs_info *ino;
+	unsigned long timeout;
 
 	if (!root)
 		return NULL;
 
 	timeout = sbi->exp_timeout;
 
-	if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
+	if (!autofs_direct_busy(mnt, root, timeout, how)) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs_dentry_ino(root);
 		/* No point expiring a pending mount */
@@ -318,7 +317,7 @@ static struct dentry *autofs_expire_direct(struct super_block *sb,
 		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
-		if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
+		if (!autofs_direct_busy(mnt, root, timeout, how)) {
 			spin_lock(&sbi->fs_lock);
 			ino->flags |= AUTOFS_INF_EXPIRING;
 			init_completion(&ino->expire_complete);
@@ -343,10 +342,8 @@ out:
 static struct dentry *should_expire(struct dentry *dentry,
 				    struct vfsmount *mnt,
 				    unsigned long timeout,
-				    int how)
+				    unsigned int how)
 {
-	int do_now = how & AUTOFS_EXP_IMMEDIATE;
-	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 	struct autofs_info *ino = autofs_dentry_ino(dentry);
 	unsigned int ino_count;
 
@@ -368,7 +365,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 			return NULL;
 
 		/* Can we expire this guy */
-		if (autofs_can_expire(dentry, timeout, do_now))
+		if (autofs_can_expire(dentry, timeout, how))
 			return dentry;
 		return NULL;
 	}
@@ -379,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		 * A symlink can't be "busy" in the usual sense so
 		 * just check last used for expire timeout.
 		 */
-		if (autofs_can_expire(dentry, timeout, do_now))
+		if (autofs_can_expire(dentry, timeout, how))
 			return dentry;
 		return NULL;
 	}
@@ -388,13 +385,13 @@ static struct dentry *should_expire(struct dentry *dentry,
 		return NULL;
 
 	/* Case 2: tree mount, expire iff entire tree is not busy */
-	if (!exp_leaves) {
+	if (!(how & AUTOFS_EXP_LEAVES)) {
 		/* Path walk currently on this dentry? */
 		ino_count = atomic_read(&ino->count) + 1;
 		if (d_count(dentry) > ino_count)
 			return NULL;
 
-		if (!autofs_tree_busy(mnt, dentry, timeout, do_now))
+		if (!autofs_tree_busy(mnt, dentry, timeout, how))
 			return dentry;
 	/*
 	 * Case 3: pseudo direct mount, expire individual leaves
@@ -408,7 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		if (d_count(dentry) > ino_count)
 			return NULL;
 
-		expired = autofs_check_leaves(mnt, dentry, timeout, do_now);
+		expired = autofs_check_leaves(mnt, dentry, timeout, how);
 		if (expired) {
 			if (expired == dentry)
 				dput(dentry);
@@ -427,7 +424,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 static struct dentry *autofs_expire_indirect(struct super_block *sb,
 					     struct vfsmount *mnt,
 					     struct autofs_sb_info *sbi,
-					     int how)
+					     unsigned int how)
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
@@ -443,8 +440,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 
 	dentry = NULL;
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
-		int flags = how;
-
 		spin_lock(&sbi->fs_lock);
 		ino = autofs_dentry_ino(dentry);
 		if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
@@ -453,7 +448,7 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 		}
 		spin_unlock(&sbi->fs_lock);
 
-		expired = should_expire(dentry, mnt, timeout, flags);
+		expired = should_expire(dentry, mnt, timeout, how);
 		if (!expired)
 			continue;
 
@@ -466,8 +461,8 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 		/* Make sure a reference is not taken on found if
 		 * things have changed.
 		 */
-		flags &= ~AUTOFS_EXP_LEAVES;
-		found = should_expire(expired, mnt, timeout, flags);
+		how &= ~AUTOFS_EXP_LEAVES;
+		found = should_expire(expired, mnt, timeout, how);
 		if (!found || found != expired)
 			/* Something has changed, continue */
 			goto next;
@@ -580,15 +575,15 @@ int autofs_expire_run(struct super_block *sb,
 }
 
 int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-			   struct autofs_sb_info *sbi, int when)
+			   struct autofs_sb_info *sbi, unsigned int how)
 {
 	struct dentry *dentry;
 	int ret = -EAGAIN;
 
 	if (autofs_type_trigger(sbi->type))
-		dentry = autofs_expire_direct(sb, mnt, sbi, when);
+		dentry = autofs_expire_direct(sb, mnt, sbi, how);
 	else
-		dentry = autofs_expire_indirect(sb, mnt, sbi, when);
+		dentry = autofs_expire_indirect(sb, mnt, sbi, how);
 
 	if (dentry) {
 		struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -618,10 +613,10 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			struct autofs_sb_info *sbi, int __user *arg)
 {
-	int do_now = 0;
+	unsigned int how = 0;
 
-	if (arg && get_user(do_now, arg))
+	if (arg && get_user(how, arg))
 		return -EFAULT;
 
-	return autofs_do_expire_multi(sb, mnt, sbi, do_now);
+	return autofs_do_expire_multi(sb, mnt, sbi, how);
 }
-- 
cgit v1.2.3


From cbf6898fd69455092c43cd573b38d42c86ddb1e0 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 21 Aug 2018 21:59:01 -0700
Subject: autofs: add AUTOFS_EXP_FORCED flag

The userspace automount(8) daemon is meant to perform a forced expire when
sent a SIGUSR2.

But since the expiration is routed through the kernel and the kernel
doesn't send an expire request if the mount is busy this hasn't worked at
least since autofs version 5.

Add an AUTOFS_EXP_FORCED flag to allow implemention of the feature and
bump the protocol version so user space can check if it's implemented if
needed.

Link: http://lkml.kernel.org/r/152937734715.21213.6594007182776598970.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs/expire.c           | 62 +++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/auto_fs.h |  8 +++---
 2 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index dfb666c5b8a2..d441244b79df 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -29,7 +29,8 @@ static inline int autofs_can_expire(struct dentry *dentry,
 }
 
 /* Check a mount point for busyness */
-static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
+static int autofs_mount_busy(struct vfsmount *mnt,
+			     struct dentry *dentry, unsigned int how)
 {
 	struct dentry *top = dentry;
 	struct path path = {.mnt = mnt, .dentry = dentry};
@@ -50,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 			goto done;
 	}
 
+	/* Not a submount, has a forced expire been requested */
+	if (how & AUTOFS_EXP_FORCED) {
+		status = 0;
+		goto done;
+	}
+
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino;
@@ -189,6 +196,10 @@ static int autofs_direct_busy(struct vfsmount *mnt,
 {
 	pr_debug("top %p %pd\n", top, top);
 
+	/* Forced expire, user space handles busy mounts */
+	if (how & AUTOFS_EXP_FORCED)
+		return 0;
+
 	/* If it's busy update the expiry counters */
 	if (!may_umount_tree(mnt)) {
 		struct autofs_info *ino;
@@ -235,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 		 * If the fs is busy update the expiry counter.
 		 */
 		if (d_mountpoint(p)) {
-			if (autofs_mount_busy(mnt, p)) {
+			if (autofs_mount_busy(mnt, p, how)) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
@@ -258,6 +269,10 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 		}
 	}
 
+	/* Forced expire, user space handles busy mounts */
+	if (how & AUTOFS_EXP_FORCED)
+		return 0;
+
 	/* Timeout of a tree mount is ultimately determined by its top dentry */
 	if (!autofs_can_expire(top, timeout, how))
 		return 1;
@@ -280,9 +295,15 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
 
 		if (d_mountpoint(p)) {
 			/* Can we umount this guy */
-			if (autofs_mount_busy(mnt, p))
+			if (autofs_mount_busy(mnt, p, how))
 				continue;
 
+			/* This isn't a submount so if a forced expire
+			 * has been requested, user space handles busy
+			 * mounts */
+			if (how & AUTOFS_EXP_FORCED)
+				return p;
+
 			/* Can we expire this guy */
 			if (autofs_can_expire(p, timeout, how))
 				return p;
@@ -361,9 +382,15 @@ static struct dentry *should_expire(struct dentry *dentry,
 		pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
 
 		/* Can we umount this guy */
-		if (autofs_mount_busy(mnt, dentry))
+		if (autofs_mount_busy(mnt, dentry, how))
 			return NULL;
 
+		/* This isn't a submount so if a forced expire
+		 * has been requested, user space handles busy
+		 * mounts */
+		if (how & AUTOFS_EXP_FORCED)
+			return dentry;
+
 		/* Can we expire this guy */
 		if (autofs_can_expire(dentry, timeout, how))
 			return dentry;
@@ -372,6 +399,11 @@ static struct dentry *should_expire(struct dentry *dentry,
 
 	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
 		pr_debug("checking symlink %p %pd\n", dentry, dentry);
+
+		/* Forced expire, user space handles busy mounts */
+		if (how & AUTOFS_EXP_FORCED)
+			return dentry;
+
 		/*
 		 * A symlink can't be "busy" in the usual sense so
 		 * just check last used for expire timeout.
@@ -386,10 +418,13 @@ static struct dentry *should_expire(struct dentry *dentry,
 
 	/* Case 2: tree mount, expire iff entire tree is not busy */
 	if (!(how & AUTOFS_EXP_LEAVES)) {
-		/* Path walk currently on this dentry? */
-		ino_count = atomic_read(&ino->count) + 1;
-		if (d_count(dentry) > ino_count)
-			return NULL;
+		/* Not a forced expire? */
+		if (!(how & AUTOFS_EXP_FORCED)) {
+			/* ref-walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (d_count(dentry) > ino_count)
+				return NULL;
+		}
 
 		if (!autofs_tree_busy(mnt, dentry, timeout, how))
 			return dentry;
@@ -398,12 +433,15 @@ static struct dentry *should_expire(struct dentry *dentry,
 	 *	   (autofs-4.1).
 	 */
 	} else {
-		/* Path walk currently on this dentry? */
 		struct dentry *expired;
 
-		ino_count = atomic_read(&ino->count) + 1;
-		if (d_count(dentry) > ino_count)
-			return NULL;
+		/* Not a forced expire? */
+		if (!(how & AUTOFS_EXP_FORCED)) {
+			/* ref-walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (d_count(dentry) > ino_count)
+				return NULL;
+		}
 
 		expired = autofs_check_leaves(mnt, dentry, timeout, how);
 		if (expired) {
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index e13eec3dfb2f..df31aa9c9a8c 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		2
+#define AUTOFS_PROTO_SUBVERSION		3
 
 /*
  * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
@@ -90,8 +90,10 @@ enum {
 /* autofs version 4 and later definitions */
 
 /* Mask for expire behaviour */
-#define AUTOFS_EXP_IMMEDIATE		1
-#define AUTOFS_EXP_LEAVES		2
+#define AUTOFS_EXP_NORMAL		0x00
+#define AUTOFS_EXP_IMMEDIATE		0x01
+#define AUTOFS_EXP_LEAVES		0x02
+#define AUTOFS_EXP_FORCED		0x04
 
 #define AUTOFS_TYPE_ANY			0U
 #define AUTOFS_TYPE_INDIRECT		1U
-- 
cgit v1.2.3


From 21a1a52dbdd5c9dc17c546bbdb95038f53515d2c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:59:05 -0700
Subject: nilfs2: use 64-bit superblock timstamps

The mount time field in the superblock uses a 64-bit timestamp, but
calling get_seconds() may truncate the current time to 32 bits.

This changes it to ktime_get_real_seconds() to avoid the potential
overflow.

Link: http://lkml.kernel.org/r/20180620075041.4154396-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: David Howells <dhowells@redhat.com>
Cc: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6ffeca84d7c3..1b9067cf4511 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -834,7 +834,7 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
 		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
 
 	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
-	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+	sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds());
 
 skip_mount_setup:
 	sbp[0]->s_state =
-- 
cgit v1.2.3


From c8ed98cd88a2b39f504e1ce6af42423c71ebe5b7 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Tue, 21 Aug 2018 21:59:08 -0700
Subject: fs/nilfs2/file.c: use new return type vm_fault_t

Use new return type vm_fault_t for page_mkwrite handler.

Link: http://lkml.kernel.org/r/1529555928-2411-1-git-send-email-konishi.ryusuke@lab.ntt.co.jp
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index c5fa3dee72fc..7da0fac71dc2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -51,7 +51,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	return err;
 }
 
-static int nilfs_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
-- 
cgit v1.2.3


From 7464726cb5998846306ed0a7d6714afb2e37b25d Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 21 Aug 2018 21:59:12 -0700
Subject: hfsplus: don't return 0 when fill_super() failed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

syzbot is reporting NULL pointer dereference at mount_fs() [1].  This is
because hfsplus_fill_super() is by error returning 0 when
hfsplus_fill_super() detected invalid filesystem image, and mount_bdev()
is returning NULL because dget(s->s_root) == NULL if s->s_root == NULL,
and mount_fs() is accessing root->d_sb because IS_ERR(root) == false if
root == NULL.  Fix this by returning -EINVAL when hfsplus_fill_super()
detected invalid filesystem image.

[1] https://syzkaller.appspot.com/bug?id=21acb6850cecbc960c927229e597158cf35f33d0

Link: http://lkml.kernel.org/r/d83ce31a-874c-dd5b-f790-41405983a5be@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+01ffaf5d9568dd1609f7@syzkaller.appspotmail.com>
Reviewed-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index a6c0f54c48c3..80abba550bfa 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -524,8 +524,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_put_root;
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
-		if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
+		if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) {
+			err = -EINVAL;
 			goto out_put_root;
+		}
 		inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
-- 
cgit v1.2.3


From 31651c607151f1034cfb57e5a78678bea54c362b Mon Sep 17 00:00:00 2001
From: "Ernesto A. Fernández" <ernesto.mnd.fernandez@gmail.com>
Date: Tue, 21 Aug 2018 21:59:16 -0700
Subject: hfsplus: avoid deadlock on file truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After an extent is removed from the extent tree, the corresponding bits
are also cleared from the block allocation file.  This is currently done
without releasing the tree lock.

The problem is that the allocation file has extents of its own; if it is
fragmented enough, some of them may be in the extent tree as well, and
hfsplus_get_block() will try to take the lock again.

To avoid deadlock, only hold the extent tree lock during the actual tree
operations.

Link: http://lkml.kernel.org/r/20180709202549.auxwkb6memlegb4a@eaf
Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Cc: Viacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/extents.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index e8770935ce6d..8e0f59767694 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -336,6 +336,9 @@ static int hfsplus_free_extents(struct super_block *sb,
 	int i;
 	int err = 0;
 
+	/* Mapping the allocation file may lock the extent tree */
+	WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock));
+
 	hfsplus_dump_extent(extent);
 	for (i = 0; i < 8; extent++, i++) {
 		count = be32_to_cpu(extent->block_count);
@@ -415,11 +418,13 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
 		if (res)
 			break;
 		start = be32_to_cpu(fd.key->ext.start_block);
-		hfsplus_free_extents(sb, ext_entry,
-				     total_blocks - start,
-				     total_blocks);
 		hfs_brec_remove(&fd);
+
+		mutex_unlock(&fd.tree->tree_lock);
+		hfsplus_free_extents(sb, ext_entry, total_blocks - start,
+				     total_blocks);
 		total_blocks = start;
+		mutex_lock(&fd.tree->tree_lock);
 	} while (total_blocks > blocks);
 	hfs_find_exit(&fd);
 
@@ -576,15 +581,20 @@ void hfsplus_file_truncate(struct inode *inode)
 	}
 	while (1) {
 		if (alloc_cnt == hip->first_blocks) {
+			mutex_unlock(&fd.tree->tree_lock);
 			hfsplus_free_extents(sb, hip->first_extents,
 					     alloc_cnt, alloc_cnt - blk_cnt);
 			hfsplus_dump_extent(hip->first_extents);
 			hip->first_blocks = blk_cnt;
+			mutex_lock(&fd.tree->tree_lock);
 			break;
 		}
 		res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
 		if (res)
 			break;
+		hfs_brec_remove(&fd);
+
+		mutex_unlock(&fd.tree->tree_lock);
 		start = hip->cached_start;
 		hfsplus_free_extents(sb, hip->cached_extents,
 				     alloc_cnt - start, alloc_cnt - blk_cnt);
@@ -596,7 +606,7 @@ void hfsplus_file_truncate(struct inode *inode)
 		alloc_cnt = start;
 		hip->cached_start = hip->cached_blocks = 0;
 		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
-		hfs_brec_remove(&fd);
+		mutex_lock(&fd.tree->tree_lock);
 	}
 	hfs_find_exit(&fd);
 
-- 
cgit v1.2.3


From afd6c9e1f5287ad236adcf56db8c42fef65561fa Mon Sep 17 00:00:00 2001
From: "Ernesto A. Fernández" <ernesto.mnd.fernandez@gmail.com>
Date: Tue, 21 Aug 2018 21:59:19 -0700
Subject: hfsplus: fix decomposition of Hangul characters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Files created under macOS cannot be opened under linux if their names
contain Korean characters, and vice versa.

The Korean alphabet is special because its normalization is done without a
table.  The module deals with it correctly when composing, but forgets
about it for the decomposition.

Fix this using the Hangul decomposition function provided in the Unicode
Standard.  The code fits a bit awkwardly because it requires a buffer,
while all the other normalizations are returned as pointers to the
decomposition table.  This is actually also a bug because reordering may
still be needed, but for now leave it as it is.

The patch will cause trouble for Hangul filenames already created by the
module in the past.  This shouldn't really be concern because its main
purpose was always sharing with macOS.  If a user actually needs to access
such a file the nodecompose mount option should be enough.

Link: http://lkml.kernel.org/r/20180717220951.p6qqrgautc4pxvzu@eaf
Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Reported-by: Ting-Chang Hou <tchou@synology.com>
Tested-by: Ting-Chang Hou <tchou@synology.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/unicode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index dfa90c21948f..c8d1b2be7854 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -272,8 +272,8 @@ static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
 	return size;
 }
 
-/* Decomposes a single unicode character. */
-static inline u16 *decompose_unichar(wchar_t uc, int *size)
+/* Decomposes a non-Hangul unicode character. */
+static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size)
 {
 	int off;
 
@@ -296,6 +296,51 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)
 	return hfsplus_decompose_table + (off / 4);
 }
 
+/*
+ * Try to decompose a unicode character as Hangul. Return 0 if @uc is not
+ * precomposed Hangul, otherwise return the length of the decomposition.
+ *
+ * This function was adapted from sample code from the Unicode Standard
+ * Annex #15: Unicode Normalization Forms, version 3.2.0.
+ *
+ * Copyright (C) 1991-2018 Unicode, Inc.  All rights reserved.  Distributed
+ * under the Terms of Use in http://www.unicode.org/copyright.html.
+ */
+static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result)
+{
+	int index;
+	int l, v, t;
+
+	index = uc - Hangul_SBase;
+	if (index < 0 || index >= Hangul_SCount)
+		return 0;
+
+	l = Hangul_LBase + index / Hangul_NCount;
+	v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
+	t = Hangul_TBase + index % Hangul_TCount;
+
+	result[0] = l;
+	result[1] = v;
+	if (t != Hangul_TBase) {
+		result[2] = t;
+		return 3;
+	}
+	return 2;
+}
+
+/* Decomposes a single unicode character. */
+static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
+{
+	u16 *result;
+
+	/* Hangul is handled separately */
+	result = hangul_buffer;
+	*size = hfsplus_try_decompose_hangul(uc, result);
+	if (*size == 0)
+		result = hfsplus_decompose_nonhangul(uc, size);
+	return result;
+}
+
 int hfsplus_asc2uni(struct super_block *sb,
 		    struct hfsplus_unistr *ustr, int max_unistr_len,
 		    const char *astr, int len)
@@ -303,13 +348,14 @@ int hfsplus_asc2uni(struct super_block *sb,
 	int size, dsize, decompose;
 	u16 *dstr, outlen = 0;
 	wchar_t c;
+	u16 dhangul[3];
 
 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
 	while (outlen < max_unistr_len && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
 		if (decompose)
-			dstr = decompose_unichar(c, &dsize);
+			dstr = decompose_unichar(c, &dsize, dhangul);
 		else
 			dstr = NULL;
 		if (dstr) {
@@ -344,6 +390,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
 	unsigned long hash;
 	wchar_t c;
 	u16 c2;
+	u16 dhangul[3];
 
 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
@@ -357,7 +404,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
 		len -= size;
 
 		if (decompose)
-			dstr = decompose_unichar(c, &dsize);
+			dstr = decompose_unichar(c, &dsize, dhangul);
 		else
 			dstr = NULL;
 		if (dstr) {
@@ -396,6 +443,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
 	const char *astr1, *astr2;
 	u16 c1, c2;
 	wchar_t c;
+	u16 dhangul_1[3], dhangul_2[3];
 
 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
@@ -413,7 +461,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
 			len1 -= size;
 
 			if (decompose)
-				dstr1 = decompose_unichar(c, &dsize1);
+				dstr1 = decompose_unichar(c, &dsize1,
+							  dhangul_1);
 			if (!decompose || !dstr1) {
 				c1 = c;
 				dstr1 = &c1;
@@ -427,7 +476,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
 			len2 -= size;
 
 			if (decompose)
-				dstr2 = decompose_unichar(c, &dsize2);
+				dstr2 = decompose_unichar(c, &dsize2,
+							  dhangul_2);
 			if (!decompose || !dstr2) {
 				c2 = c;
 				dstr2 = &c2;
-- 
cgit v1.2.3


From f168d9fd634a4612d308d7dbe0a4d2a9b366c045 Mon Sep 17 00:00:00 2001
From: "Ernesto A. Fernández" <ernesto.mnd.fernandez@gmail.com>
Date: Tue, 21 Aug 2018 21:59:23 -0700
Subject: hfsplus: drop ACL support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HFS+ Access Control Lists have not worked at all for the past five
years, and nobody seems to have noticed.  Besides, POSIX draft ACLs are
not compatible with MacOS.  Drop the feature entirely.

Link: http://lkml.kernel.org/r/20180714190608.wtnmmtjqeyladkut@eaf
Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Viacheslav Dubeyko <slava@dubeyko.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/Kconfig          |  15 -----
 fs/hfsplus/Makefile         |   2 -
 fs/hfsplus/acl.h            |  28 ---------
 fs/hfsplus/dir.c            |   9 +--
 fs/hfsplus/hfsplus_fs.h     |   1 -
 fs/hfsplus/inode.c          |  11 ----
 fs/hfsplus/posix_acl.c      | 144 --------------------------------------------
 fs/hfsplus/super.c          |   4 +-
 fs/hfsplus/xattr.c          |   6 --
 fs/hfsplus/xattr.h          |   3 -
 fs/hfsplus/xattr_security.c |  13 ----
 11 files changed, 4 insertions(+), 232 deletions(-)
 delete mode 100644 fs/hfsplus/acl.h
 delete mode 100644 fs/hfsplus/posix_acl.c

diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 7cc8b4acf66a..a63371815aab 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -11,18 +11,3 @@ config HFSPLUS_FS
 	  MacOS 8. It includes all Mac specific filesystem data such as
 	  data forks and creator codes, but it also has several UNIX
 	  style features such as file ownership and permissions.
-
-config HFSPLUS_FS_POSIX_ACL
-	bool "HFS+ POSIX Access Control Lists"
-	depends on HFSPLUS_FS
-	select FS_POSIX_ACL
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  It needs to understand that POSIX ACLs are treated only under
-	  Linux. POSIX ACLs doesn't mean something under Mac OS X.
-	  Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs,
-	  which are part of the NFSv4 standard.
-
-	  If you don't know what Access Control Lists are, say N
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index f6a56542f8d7..9ed20e64b983 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -8,5 +8,3 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
 hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
 		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
 		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
-
-hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL)	+= posix_acl.o
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
deleted file mode 100644
index 488c2b75cf41..000000000000
--- a/fs/hfsplus/acl.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * linux/fs/hfsplus/acl.h
- *
- * Vyacheslav Dubeyko <slava@dubeyko.com>
- *
- * Handler for Posix Access Control Lists (ACLs) support.
- */
-
-#include <linux/posix_acl_xattr.h>
-
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
-
-/* posix_acl.c */
-struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
-int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
-		int type);
-extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
-
-#else  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
-#define hfsplus_get_posix_acl NULL
-#define hfsplus_set_posix_acl NULL
-
-static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-#endif  /* CONFIG_HFSPLUS_FS_POSIX_ACL */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b5254378f011..c5a70f83dbe7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -18,7 +18,6 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 #include "xattr.h"
-#include "acl.h"
 
 static inline void hfsplus_instantiate(struct dentry *dentry,
 				       struct inode *inode, u32 cnid)
@@ -455,7 +454,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 	if (res)
 		goto out_err;
 
-	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	res = hfsplus_init_security(inode, dir, &dentry->d_name);
 	if (res == -EOPNOTSUPP)
 		res = 0; /* Operation is not supported. */
 	else if (res) {
@@ -496,7 +495,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 	if (res)
 		goto failed_mknod;
 
-	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	res = hfsplus_init_security(inode, dir, &dentry->d_name);
 	if (res == -EOPNOTSUPP)
 		res = 0; /* Operation is not supported. */
 	else if (res) {
@@ -567,10 +566,6 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 	.mknod			= hfsplus_mknod,
 	.rename			= hfsplus_rename,
 	.listxattr		= hfsplus_listxattr,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
-	.get_acl		= hfsplus_get_posix_acl,
-	.set_acl		= hfsplus_set_posix_acl,
-#endif
 };
 
 const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d9255abafb81..8e039435958a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -31,7 +31,6 @@
 #define DBG_EXTENT	0x00000020
 #define DBG_BITMAP	0x00000040
 #define DBG_ATTR_MOD	0x00000080
-#define DBG_ACL_MOD	0x00000100
 
 #if 0
 #define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c824f702feec..8e9427a42b81 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -21,7 +21,6 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 #include "xattr.h"
-#include "acl.h"
 
 static int hfsplus_readpage(struct file *file, struct page *page)
 {
@@ -267,12 +266,6 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 
-	if (attr->ia_valid & ATTR_MODE) {
-		error = posix_acl_chmod(inode, inode->i_mode);
-		if (unlikely(error))
-			return error;
-	}
-
 	return 0;
 }
 
@@ -336,10 +329,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.setattr	= hfsplus_setattr,
 	.listxattr	= hfsplus_listxattr,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
-	.get_acl	= hfsplus_get_posix_acl,
-	.set_acl	= hfsplus_set_posix_acl,
-#endif
 };
 
 static const struct file_operations hfsplus_file_operations = {
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
deleted file mode 100644
index 066114dcc3a2..000000000000
--- a/fs/hfsplus/posix_acl.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/hfsplus/posix_acl.c
- *
- * Vyacheslav Dubeyko <slava@dubeyko.com>
- *
- * Handler for Posix Access Control Lists (ACLs) support.
- */
-
-#include "hfsplus_fs.h"
-#include "xattr.h"
-#include "acl.h"
-
-struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
-{
-	struct posix_acl *acl;
-	char *xattr_name;
-	char *value = NULL;
-	ssize_t size;
-
-	hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		return ERR_PTR(-EINVAL);
-	}
-
-	size = __hfsplus_getxattr(inode, xattr_name, NULL, 0);
-
-	if (size > 0) {
-		value = (char *)hfsplus_alloc_attr_entry();
-		if (unlikely(!value))
-			return ERR_PTR(-ENOMEM);
-		size = __hfsplus_getxattr(inode, xattr_name, value, size);
-	}
-
-	if (size > 0)
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-	else if (size == -ENODATA)
-		acl = NULL;
-	else
-		acl = ERR_PTR(size);
-
-	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
-
-	return acl;
-}
-
-static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
-				   int type)
-{
-	int err;
-	char *xattr_name;
-	size_t size = 0;
-	char *value = NULL;
-
-	hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
-		break;
-
-	case ACL_TYPE_DEFAULT:
-		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
-		if (!S_ISDIR(inode->i_mode))
-			return acl ? -EACCES : 0;
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	if (acl) {
-		size = posix_acl_xattr_size(acl->a_count);
-		if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE))
-			return -ENOMEM;
-		value = (char *)hfsplus_alloc_attr_entry();
-		if (unlikely(!value))
-			return -ENOMEM;
-		err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
-		if (unlikely(err < 0))
-			goto end_set_acl;
-	}
-
-	err = __hfsplus_setxattr(inode, xattr_name, value, size, 0);
-
-end_set_acl:
-	hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value);
-
-	if (!err)
-		set_cached_acl(inode, type, acl);
-
-	return err;
-}
-
-int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
-	int err;
-
-	if (type == ACL_TYPE_ACCESS && acl) {
-		err = posix_acl_update_mode(inode, &inode->i_mode, &acl);
-		if (err)
-			return err;
-	}
-	return __hfsplus_set_posix_acl(inode, acl, type);
-}
-
-int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
-{
-	int err = 0;
-	struct posix_acl *default_acl, *acl;
-
-	hfs_dbg(ACL_MOD,
-		"[%s]: ino %lu, dir->ino %lu\n",
-		__func__, inode->i_ino, dir->i_ino);
-
-	if (S_ISLNK(inode->i_mode))
-		return 0;
-
-	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
-	if (err)
-		return err;
-
-	if (default_acl) {
-		err = __hfsplus_set_posix_acl(inode, default_acl,
-					      ACL_TYPE_DEFAULT);
-		posix_acl_release(default_acl);
-	}
-
-	if (acl) {
-		if (!err)
-			err = __hfsplus_set_posix_acl(inode, acl,
-						      ACL_TYPE_ACCESS);
-		posix_acl_release(acl);
-	}
-	return err;
-}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80abba550bfa..eb4535eba95d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -564,8 +564,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 				goto out_put_hidden_dir;
 			}
 
-			err = hfsplus_init_inode_security(sbi->hidden_dir,
-								root, &str);
+			err = hfsplus_init_security(sbi->hidden_dir,
+							root, &str);
 			if (err == -EOPNOTSUPP)
 				err = 0; /* Operation is not supported. */
 			else if (err) {
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e538b758c448..d5403b4004c9 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -8,10 +8,8 @@
  */
 
 #include "hfsplus_fs.h"
-#include <linux/posix_acl_xattr.h>
 #include <linux/nls.h>
 #include "xattr.h"
-#include "acl.h"
 
 static int hfsplus_removexattr(struct inode *inode, const char *name);
 
@@ -19,10 +17,6 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = {
 	&hfsplus_xattr_osx_handler,
 	&hfsplus_xattr_user_handler,
 	&hfsplus_xattr_trusted_handler,
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
-	&posix_acl_access_xattr_handler,
-	&posix_acl_default_xattr_handler,
-#endif
 	&hfsplus_xattr_security_handler,
 	NULL
 };
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index a4e611d69710..d14e362b3eba 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -38,7 +38,4 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int hfsplus_init_security(struct inode *inode, struct inode *dir,
 				const struct qstr *qstr);
 
-int hfsplus_init_inode_security(struct inode *inode, struct inode *dir,
-				const struct qstr *qstr);
-
 #endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index f5550b006e88..cfbe6a3bfb1e 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -12,7 +12,6 @@
 
 #include "hfsplus_fs.h"
 #include "xattr.h"
-#include "acl.h"
 
 static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 				     struct dentry *unused, struct inode *inode,
@@ -72,18 +71,6 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir,
 					&hfsplus_initxattrs, NULL);
 }
 
-int hfsplus_init_inode_security(struct inode *inode,
-						struct inode *dir,
-						const struct qstr *qstr)
-{
-	int err;
-
-	err = hfsplus_init_posix_acl(inode, dir);
-	if (!err)
-		err = hfsplus_init_security(inode, dir, qstr);
-	return err;
-}
-
 const struct xattr_handler hfsplus_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.get	= hfsplus_security_getxattr,
-- 
cgit v1.2.3


From 34d082604a7c8856051ff441fef0e22d93afd848 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:59:26 -0700
Subject: reiserfs: use monotonic time for j_trans_start_time

Using CLOCK_REALTIME time_t timestamps breaks on 32-bit systems in 2038,
and gives surprising results with a concurrent settimeofday().

This changes the reiserfs journal timestamps to use ktime_get_seconds()
instead, which makes it use a 64-bit CLOCK_MONOTONIC stamp.

In the procfs output, the monotonic timestamp needs to be converted back
to CLOCK_REALTIME to keep the existing ABI.

Link: http://lkml.kernel.org/r/20180620142522.27639-2-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/journal.c  | 22 +++++++++++-----------
 fs/reiserfs/procfs.c   | 11 +++++++++--
 fs/reiserfs/reiserfs.h |  2 +-
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 52eb5d293a34..8a76f9d14bc6 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2381,7 +2381,7 @@ static int journal_read(struct super_block *sb)
 	struct reiserfs_journal_desc *desc;
 	unsigned int oldest_trans_id = 0;
 	unsigned int oldest_invalid_trans_id = 0;
-	time_t start;
+	time64_t start;
 	unsigned long oldest_start = 0;
 	unsigned long cur_dblock = 0;
 	unsigned long newest_mount_id = 9;
@@ -2395,7 +2395,7 @@ static int journal_read(struct super_block *sb)
 	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
 	reiserfs_info(sb, "checking transaction log (%pg)\n",
 		      journal->j_dev_bd);
-	start = get_seconds();
+	start = ktime_get_seconds();
 
 	/*
 	 * step 1, read in the journal header block.  Check the transaction
@@ -2556,7 +2556,7 @@ start_log_replay:
 	if (replay_count > 0) {
 		reiserfs_info(sb,
 			      "replayed %d transactions in %lu seconds\n",
-			      replay_count, get_seconds() - start);
+			      replay_count, ktime_get_seconds() - start);
 	}
 	/* needed to satisfy the locking in _update_journal_header_block */
 	reiserfs_write_lock(sb);
@@ -2914,7 +2914,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
 				   int new_alloc)
 {
 	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
-	time_t now = get_seconds();
+	time64_t now = ktime_get_seconds();
 	/* cannot restart while nested */
 	BUG_ON(!th->t_trans_id);
 	if (th->t_refcount > 1)
@@ -3023,7 +3023,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 			      struct super_block *sb, unsigned long nblocks,
 			      int join)
 {
-	time_t now = get_seconds();
+	time64_t now = ktime_get_seconds();
 	unsigned int old_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 	struct reiserfs_transaction_handle myth;
@@ -3056,7 +3056,7 @@ relock:
 		PROC_INFO_INC(sb, journal.journal_relock_writers);
 		goto relock;
 	}
-	now = get_seconds();
+	now = ktime_get_seconds();
 
 	/*
 	 * if there is no room in the journal OR
@@ -3119,7 +3119,7 @@ relock:
 	}
 	/* we are the first writer, set trans_id */
 	if (journal->j_trans_start_time == 0) {
-		journal->j_trans_start_time = get_seconds();
+		journal->j_trans_start_time = ktime_get_seconds();
 	}
 	atomic_inc(&journal->j_wcount);
 	journal->j_len_alloc += nblocks;
@@ -3559,11 +3559,11 @@ static void flush_async_commits(struct work_struct *work)
  */
 void reiserfs_flush_old_commits(struct super_block *sb)
 {
-	time_t now;
+	time64_t now;
 	struct reiserfs_transaction_handle th;
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
-	now = get_seconds();
+	now = ktime_get_seconds();
 	/*
 	 * safety check so we don't flush while we are replaying the log during
 	 * mount
@@ -3613,7 +3613,7 @@ void reiserfs_flush_old_commits(struct super_block *sb)
 static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
 {
 
-	time_t now;
+	time64_t now;
 	int flush = flags & FLUSH_ALL;
 	int commit_now = flags & COMMIT_NOW;
 	int wait_on_commit = flags & WAIT;
@@ -3694,7 +3694,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
 	}
 
 	/* deal with old transactions where we are the last writers */
-	now = get_seconds();
+	now = ktime_get_seconds();
 	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
 		commit_now = 1;
 		journal->j_next_async_flush = 1;
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index e39b3910d24d..f2cf3441fdfc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -297,6 +297,13 @@ static int show_oidmap(struct seq_file *m, void *unused)
 	return 0;
 }
 
+static time64_t ktime_mono_to_real_seconds(time64_t mono)
+{
+	ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
+
+	return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
+}
+
 static int show_journal(struct seq_file *m, void *unused)
 {
 	struct super_block *sb = m->private;
@@ -325,7 +332,7 @@ static int show_journal(struct seq_file *m, void *unused)
 		   "j_bcount: \t%lu\n"
 		   "j_first_unflushed_offset: \t%lu\n"
 		   "j_last_flush_trans_id: \t%u\n"
-		   "j_trans_start_time: \t%li\n"
+		   "j_trans_start_time: \t%lli\n"
 		   "j_list_bitmap_index: \t%i\n"
 		   "j_must_wait: \t%i\n"
 		   "j_next_full_flush: \t%i\n"
@@ -366,7 +373,7 @@ static int show_journal(struct seq_file *m, void *unused)
 		   JF(j_bcount),
 		   JF(j_first_unflushed_offset),
 		   JF(j_last_flush_trans_id),
-		   JF(j_trans_start_time),
+		   ktime_mono_to_real_seconds(JF(j_trans_start_time)),
 		   JF(j_list_bitmap_index),
 		   JF(j_must_wait),
 		   JF(j_next_full_flush),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index ae4811fecc1f..621b9a07080a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -331,7 +331,7 @@ struct reiserfs_journal {
 
 	struct buffer_head *j_header_bh;
 
-	time_t j_trans_start_time;	/* time this transaction started */
+	time64_t j_trans_start_time;	/* time this transaction started */
 	struct mutex j_mutex;
 	struct mutex j_flush_mutex;
 
-- 
cgit v1.2.3


From 5b1d149c895b3229d70c0d4c69e14bdbe5fe8226 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:59:30 -0700
Subject: reiserfs: remove obsolete print_time function

Before linux-2.4.6, print_time() was used to pretty-print an inode time
when running reiserfs in user space, after that it has become obsolete and
is still a bit incorrect: It behaves differently on 32-bit and 64-bit
machines, and uses a static buffer to hold a string, which could lead to
undefined behavior if we ever called this from multiple places
simultaneously.

Since we always want to treat the timestamps as 'unsigned' anyway, simply
printing them as an integer is both simpler and safer while avoiding the
deprecated time_t type.

Link: http://lkml.kernel.org/r/20180620142522.27639-3-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/item_ops.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index e3c558d1b78c..3a5a752d96c7 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -33,30 +33,22 @@ static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
 	return 0;
 }
 
-static char *print_time(time_t t)
-{
-	static char timebuf[256];
-
-	sprintf(timebuf, "%ld", t);
-	return timebuf;
-}
-
 static void sd_print_item(struct item_head *ih, char *item)
 {
 	printk("\tmode | size | nlinks | first direct | mtime\n");
 	if (stat_data_v1(ih)) {
 		struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
 
-		printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
+		printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
 		       sd_v1_size(sd), sd_v1_nlink(sd),
 		       sd_v1_first_direct_byte(sd),
-		       print_time(sd_v1_mtime(sd)));
+		       sd_v1_mtime(sd));
 	} else {
 		struct stat_data *sd = (struct stat_data *)item;
 
-		printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
+		printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
 		       (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
-		       sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
+		       sd_v2_rdev(sd), sd_v2_mtime(sd));
 	}
 }
 
-- 
cgit v1.2.3


From 8b73ce6a4bae4fe12bcb2c361c0da4183c2e1b6f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:59:34 -0700
Subject: reiserfs: change j_timestamp type to time64_t

This uses the deprecated time_t type but is write-only, and could be
removed, but as Jeff explains, having a timestamp can be usefule for
post-mortem analysis in crash dumps.

In order to remove one of the last instances of time_t, this changes the
type to time64_t, same as j_trans_start_time.

Link: http://lkml.kernel.org/r/20180622133315.221210-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/reiserfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 621b9a07080a..e5ca9ed79e54 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -271,7 +271,7 @@ struct reiserfs_journal_list {
 
 	struct mutex j_commit_mutex;
 	unsigned int j_trans_id;
-	time_t j_timestamp;
+	time64_t j_timestamp; /* write-only but useful for crash dump analysis */
 	struct reiserfs_list_bitmap *j_list_bitmap;
 	struct buffer_head *j_commit_bh;	/* commit buffer head */
 	struct reiserfs_journal_cnode *j_realblock;
-- 
cgit v1.2.3


From a13f085d111e90469faf2d9965eb39b11c114d7e Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 21 Aug 2018 21:59:37 -0700
Subject: reiserfs: fix broken xattr handling (heap corruption, bad retval)

This fixes the following issues:

- When a buffer size is supplied to reiserfs_listxattr() such that each
  individual name fits, but the concatenation of all names doesn't fit,
  reiserfs_listxattr() overflows the supplied buffer.  This leads to a
  kernel heap overflow (verified using KASAN) followed by an out-of-bounds
  usercopy and is therefore a security bug.

- When a buffer size is supplied to reiserfs_listxattr() such that a
  name doesn't fit, -ERANGE should be returned.  But reiserfs instead just
  truncates the list of names; I have verified that if the only xattr on a
  file has a longer name than the supplied buffer length, listxattr()
  incorrectly returns zero.

With my patch applied, -ERANGE is returned in both cases and the memory
corruption doesn't happen anymore.

Credit for making me clean this code up a bit goes to Al Viro, who pointed
out that the ->actor calling convention is suboptimal and should be
changed.

Link: http://lkml.kernel.org/r/20180802151539.5373-1-jannh@google.com
Fixes: 48b32a3553a5 ("reiserfs: use generic xattr handlers")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Jeff Mahoney <jeffm@suse.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ff94fad477e4..48cdfc81fe10 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -792,8 +792,10 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
 			return 0;
 		size = namelen + 1;
 		if (b->buf) {
-			if (size > b->size)
+			if (b->pos + size > b->size) {
+				b->pos = -ERANGE;
 				return -ERANGE;
+			}
 			memcpy(b->buf + b->pos, name, namelen);
 			b->buf[b->pos + namelen] = 0;
 		}
-- 
cgit v1.2.3


From f663b5b38fffeb31841f8bfaf0ef87a445b0ffee Mon Sep 17 00:00:00 2001
From: Wentao Wang <witallwang@gmail.com>
Date: Tue, 21 Aug 2018 21:59:41 -0700
Subject: fat: add FITRIM ioctl for FAT file system

Add FITRIM ioctl for FAT file system

[witallwang@gmail.com: use u64s]
  Link: http://lkml.kernel.org/r/87h8l37hub.fsf@mail.parknet.co.jp
[hirofumi@mail.parknet.co.jp: bug fixes, coding style fixes, add signal check]
Link: http://lkml.kernel.org/r/87fu10anhj.fsf@mail.parknet.co.jp
Signed-off-by: Wentao Wang <witallwang@gmail.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h    |   1 +
 fs/fat/fatent.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/file.c   |  33 ++++++++++++++++++
 3 files changed, 136 insertions(+)

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 8fc1093da47d..154ae54a6b3a 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -357,6 +357,7 @@ extern int fat_alloc_clusters(struct inode *inode, int *cluster,
 			      int nr_cluster);
 extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
+extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range);
 
 /* fat/file.c */
 extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index bac10de678cc..25d43a5e8a4d 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/sched/signal.h>
 #include "fat.h"
 
 struct fatent_operations {
@@ -690,3 +691,104 @@ out:
 	unlock_fat(sbi);
 	return err;
 }
+
+static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus),
+				nr_clus * sbi->sec_per_clus, GFP_NOFS, 0);
+}
+
+int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
+{
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const struct fatent_operations *ops = sbi->fatent_ops;
+	struct fat_entry fatent;
+	u64 ent_start, ent_end, minlen, trimmed = 0;
+	u32 free = 0;
+	unsigned long reada_blocks, reada_mask, cur_block = 0;
+	int err = 0;
+
+	/*
+	 * FAT data is organized as clusters, trim at the granulary of cluster.
+	 *
+	 * fstrim_range is in byte, convert vaules to cluster index.
+	 * Treat sectors before data region as all used, not to trim them.
+	 */
+	ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
+	ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1;
+	minlen = range->minlen >> sbi->cluster_bits;
+
+	if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size)
+		return -EINVAL;
+	if (ent_end >= sbi->max_cluster)
+		ent_end = sbi->max_cluster - 1;
+
+	reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
+	reada_mask = reada_blocks - 1;
+
+	fatent_init(&fatent);
+	lock_fat(sbi);
+	fatent_set_entry(&fatent, ent_start);
+	while (fatent.entry <= ent_end) {
+		/* readahead of fat blocks */
+		if ((cur_block & reada_mask) == 0) {
+			unsigned long rest = sbi->fat_length - cur_block;
+			fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
+		}
+		cur_block++;
+
+		err = fat_ent_read_block(sb, &fatent);
+		if (err)
+			goto error;
+		do {
+			if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
+				free++;
+			} else if (free) {
+				if (free >= minlen) {
+					u32 clus = fatent.entry - free;
+
+					err = fat_trim_clusters(sb, clus, free);
+					if (err && err != -EOPNOTSUPP)
+						goto error;
+					if (!err)
+						trimmed += free;
+					err = 0;
+				}
+				free = 0;
+			}
+		} while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end);
+
+		if (fatal_signal_pending(current)) {
+			err = -ERESTARTSYS;
+			goto error;
+		}
+
+		if (need_resched()) {
+			fatent_brelse(&fatent);
+			unlock_fat(sbi);
+			cond_resched();
+			lock_fat(sbi);
+		}
+	}
+	/* handle scenario when tail entries are all free */
+	if (free && free >= minlen) {
+		u32 clus = fatent.entry - free;
+
+		err = fat_trim_clusters(sb, clus, free);
+		if (err && err != -EOPNOTSUPP)
+			goto error;
+		if (!err)
+			trimmed += free;
+		err = 0;
+	}
+
+error:
+	fatent_brelse(&fatent);
+	unlock_fat(sbi);
+
+	range->len = trimmed << sbi->cluster_bits;
+
+	return err;
+}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4724cc9ad650..4f3d72fb1e60 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -121,6 +121,37 @@ static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
 	return put_user(sbi->vol_id, user_attr);
 }
 
+static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
+{
+	struct super_block *sb = inode->i_sb;
+	struct fstrim_range __user *user_range;
+	struct fstrim_range range;
+	struct request_queue *q = bdev_get_queue(sb->s_bdev);
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	user_range = (struct fstrim_range __user *)arg;
+	if (copy_from_user(&range, user_range, sizeof(range)))
+		return -EFAULT;
+
+	range.minlen = max_t(unsigned int, range.minlen,
+			     q->limits.discard_granularity);
+
+	err = fat_trim_fs(inode, &range);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(user_range, &range, sizeof(range)))
+		return -EFAULT;
+
+	return 0;
+}
+
 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -133,6 +164,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return fat_ioctl_set_attributes(filp, user_attr);
 	case FAT_IOCTL_GET_VOLUME_ID:
 		return fat_ioctl_get_volume_id(inode, user_attr);
+	case FITRIM:
+		return fat_ioctl_fitrim(inode, arg);
 	default:
 		return -ENOTTY;	/* Inappropriate ioctl for device */
 	}
-- 
cgit v1.2.3


From 0afa9626667c3659ef8bd82d42a11e39fedf235c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Tue, 21 Aug 2018 21:59:44 -0700
Subject: fat: validate ->i_start before using

On corrupted FATfs may have invalid ->i_start.  To handle it, this checks
->i_start before using, and return proper error code.

Link: http://lkml.kernel.org/r/87o9f8y1t5.fsf_-_@mail.parknet.co.jp
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Tested-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c  | 19 ++++++++++++-------
 fs/fat/fat.h    |  5 +++++
 fs/fat/fatent.c |  6 +++---
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index e9bed49df6b7..78d501c1fb65 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -225,7 +225,8 @@ static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus)
 int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 {
 	struct super_block *sb = inode->i_sb;
-	const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const int limit = sb->s_maxbytes >> sbi->cluster_bits;
 	struct fat_entry fatent;
 	struct fat_cache_id cid;
 	int nr;
@@ -234,6 +235,12 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 
 	*fclus = 0;
 	*dclus = MSDOS_I(inode)->i_start;
+	if (!fat_valid_entry(sbi, *dclus)) {
+		fat_fs_error_ratelimit(sb,
+			"%s: invalid start cluster (i_pos %lld, start %08x)",
+			__func__, MSDOS_I(inode)->i_pos, *dclus);
+		return -EIO;
+	}
 	if (cluster == 0)
 		return 0;
 
@@ -250,9 +257,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
 			fat_fs_error_ratelimit(sb,
-					"%s: detected the cluster chain loop"
-					" (i_pos %lld)", __func__,
-					MSDOS_I(inode)->i_pos);
+				"%s: detected the cluster chain loop (i_pos %lld)",
+				__func__, MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		}
@@ -262,9 +268,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
 			fat_fs_error_ratelimit(sb,
-				       "%s: invalid cluster chain (i_pos %lld)",
-				       __func__,
-				       MSDOS_I(inode)->i_pos);
+				"%s: invalid cluster chain (i_pos %lld)",
+				__func__, MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 154ae54a6b3a..df84d5710b59 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -348,6 +348,11 @@ static inline void fatent_brelse(struct fat_entry *fatent)
 	fatent->fat_inode = NULL;
 }
 
+static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry)
+{
+	return FAT_START_ENT <= entry && entry < sbi->max_cluster;
+}
+
 extern void fat_ent_access_init(struct super_block *sb);
 extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
 			int entry);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 25d43a5e8a4d..defc2168de91 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -24,7 +24,7 @@ static void fat12_ent_blocknr(struct super_block *sb, int entry,
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	int bytes = entry + (entry >> 1);
-	WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+	WARN_ON(!fat_valid_entry(sbi, entry));
 	*offset = bytes & (sb->s_blocksize - 1);
 	*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
 }
@@ -34,7 +34,7 @@ static void fat_ent_blocknr(struct super_block *sb, int entry,
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	int bytes = (entry << sbi->fatent_shift);
-	WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry);
+	WARN_ON(!fat_valid_entry(sbi, entry));
 	*offset = bytes & (sb->s_blocksize - 1);
 	*blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits);
 }
@@ -354,7 +354,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
 	int err, offset;
 	sector_t blocknr;
 
-	if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
+	if (!fat_valid_entry(sbi, entry)) {
 		fatent_brelse(fatent);
 		fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
 		return -EIO;
-- 
cgit v1.2.3


From f423420c23899469a3ba4e100def43ab26f2e0bf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 21:59:48 -0700
Subject: fat: propagate 64-bit inode timestamps

Now that we pass down 64-bit timestamps from VFS, we just need to convert
that correctly into on-disk timestamps.  To make that work correctly, this
changes the last use of time_to_tm() in the kernel to time64_to_tm(),
which also lets use remove that deprecated interfaces.

Similarly, the time_t use in fat_time_fat2unix() truncates the timestamp
on the way in, which can be avoided by using types that are wide enough to
hold the intermediate values during the conversion.

[hirofumi@mail.parknet.co.jp: remove useless temporary variable, needless long long]
Link: http://lkml.kernel.org/r/20180619153646.3637529-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c         |  2 +-
 fs/fat/fat.h         |  6 +++---
 fs/fat/inode.c       | 20 ++++++--------------
 fs/fat/misc.c        | 13 +++++++------
 fs/fat/namei_msdos.c | 17 ++++++-----------
 fs/fat/namei_vfat.c  | 20 +++++++-------------
 6 files changed, 30 insertions(+), 48 deletions(-)

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 8e100c3bf72c..7f5f3699fc6c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1130,7 +1130,7 @@ error:
 	return err;
 }
 
-int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
+int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
 {
 	struct super_block *sb = dir->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index df84d5710b59..9d7d2d5da28b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -304,7 +304,7 @@ extern int fat_scan_logstart(struct inode *dir, int i_logstart,
 			     struct fat_slot_info *sinfo);
 extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
 				struct msdos_dir_entry **de);
-extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts);
 extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 			   struct fat_slot_info *sinfo);
 extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
@@ -412,9 +412,9 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
 	 } while (0)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
 			      __le16 __time, __le16 __date, u8 time_cs);
-extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
 			      __le16 *time, __le16 *date, u8 *time_cs);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bfd589ea74c0..d6b81e31f9f5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -508,7 +508,6 @@ static int fat_validate_dir(struct inode *dir)
 /* doesn't deal with root inode */
 int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 {
-	struct timespec ts;
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	int error;
 
@@ -559,14 +558,11 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
-	fat_time_fat2unix(sbi, &ts, de->time, de->date, 0);
-	inode->i_mtime = timespec_to_timespec64(ts);
+	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
 	if (sbi->options.isvfat) {
-		fat_time_fat2unix(sbi, &ts, de->ctime,
+		fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
 				  de->cdate, de->ctime_cs);
-		inode->i_ctime = timespec_to_timespec64(ts);
-		fat_time_fat2unix(sbi, &ts, 0, de->adate, 0);
-		inode->i_atime = timespec_to_timespec64(ts);
+		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
 
@@ -843,7 +839,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int __fat_write_inode(struct inode *inode, int wait)
 {
-	struct timespec ts;
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh;
@@ -881,16 +876,13 @@ retry:
 		raw_entry->size = cpu_to_le32(inode->i_size);
 	raw_entry->attr = fat_make_attrs(inode);
 	fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
-	ts = timespec64_to_timespec(inode->i_mtime);
-	fat_time_unix2fat(sbi, &ts, &raw_entry->time,
+	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
 			  &raw_entry->date, NULL);
 	if (sbi->options.isvfat) {
 		__le16 atime;
-		ts = timespec64_to_timespec(inode->i_ctime);
-		fat_time_unix2fat(sbi, &ts, &raw_entry->ctime,
+		fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
 				  &raw_entry->cdate, &raw_entry->ctime_cs);
-		ts = timespec64_to_timespec(inode->i_atime);
-		fat_time_unix2fat(sbi, &ts, &atime,
+		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
 				  &raw_entry->adate, NULL);
 	}
 	spin_unlock(&sbi->inode_hash_lock);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f9bdc1e01c98..573836dcaefc 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -180,17 +180,18 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 #define IS_LEAP_YEAR(y)	(!((y) & 3) && (y) != YEAR_2100)
 
 /* Linear day numbers of the respective 1sts in non-leap years. */
-static time_t days_in_year[] = {
+static long days_in_year[] = {
 	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
 	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
 
 /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
-void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
 		       __le16 __time, __le16 __date, u8 time_cs)
 {
 	u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
-	time_t second, day, leap_day, month, year;
+	time64_t second;
+	long day, leap_day, month, year;
 
 	year  = date >> 9;
 	month = max(1, (date >> 5) & 0xf);
@@ -205,7 +206,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 	second =  (time & 0x1f) << 1;
 	second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
 	second += (time >> 11) * SECS_PER_HOUR;
-	second += (year * 365 + leap_day
+	second += (time64_t)(year * 365 + leap_day
 		   + days_in_year[month] + day
 		   + DAYS_DELTA) * SECS_PER_DAY;
 
@@ -224,11 +225,11 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 }
 
 /* Convert linear UNIX date to a FAT time/date pair. */
-void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
 		       __le16 *time, __le16 *date, u8 *time_cs)
 {
 	struct tm tm;
-	time_to_tm(ts->tv_sec,
+	time64_to_tm(ts->tv_sec,
 		   (sbi->options.tz_set ? sbi->options.time_offset :
 		   -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
 
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 16a832c37d66..efb8c40c9d27 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -225,7 +225,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 /***** Creates a directory entry (name is already formatted). */
 static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 			   int is_dir, int is_hid, int cluster,
-			   struct timespec *ts, struct fat_slot_info *sinfo)
+			   struct timespec64 *ts, struct fat_slot_info *sinfo)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
 	struct msdos_dir_entry de;
@@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 	if (err)
 		return err;
 
-	dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts);
+	dir->i_ctime = dir->i_mtime = *ts;
 	if (IS_DIRSYNC(dir))
 		(void)fat_sync_inode(dir);
 	else
@@ -267,7 +267,6 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct inode *inode = NULL;
 	struct fat_slot_info sinfo;
 	struct timespec64 ts;
-	struct timespec t;
 	unsigned char msdos_name[MSDOS_NAME];
 	int err, is_hid;
 
@@ -286,8 +285,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 
 	ts = current_time(dir);
-	t = timespec64_to_timespec(ts);
-	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo);
+	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
 	if (err)
 		goto out;
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
@@ -347,7 +345,6 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct inode *inode;
 	unsigned char msdos_name[MSDOS_NAME];
 	struct timespec64 ts;
-	struct timespec t;
 	int err, is_hid, cluster;
 
 	mutex_lock(&MSDOS_SB(sb)->s_lock);
@@ -365,13 +362,12 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	}
 
 	ts = current_time(dir);
-	t = timespec64_to_timespec(ts);
-	cluster = fat_alloc_new_dir(dir, &t);
+	cluster = fat_alloc_new_dir(dir, &ts);
 	if (cluster < 0) {
 		err = cluster;
 		goto out;
 	}
-	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo);
+	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
 	if (err)
 		goto out_free;
 	inc_nlink(dir);
@@ -503,9 +499,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		new_i_pos = MSDOS_I(new_inode)->i_pos;
 		fat_detach(new_inode);
 	} else {
-		struct timespec t = timespec64_to_timespec(ts);
 		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
-				      &t, &sinfo);
+				      &ts, &sinfo);
 		if (err)
 			goto out;
 		new_i_pos = sinfo.i_pos;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 9a5469120caa..82cd1e69cbdf 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -577,7 +577,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 
 static int vfat_build_slots(struct inode *dir, const unsigned char *name,
 			    int len, int is_dir, int cluster,
-			    struct timespec *ts,
+			    struct timespec64 *ts,
 			    struct msdos_dir_slot *slots, int *nr_slots)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
@@ -653,7 +653,7 @@ out_free:
 }
 
 static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
-			  int is_dir, int cluster, struct timespec *ts,
+			  int is_dir, int cluster, struct timespec64 *ts,
 			  struct fat_slot_info *sinfo)
 {
 	struct msdos_dir_slot *slots;
@@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
 		goto cleanup;
 
 	/* update timestamp */
-	dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts);
+	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
 	if (IS_DIRSYNC(dir))
 		(void)fat_sync_inode(dir);
 	else
@@ -762,14 +762,12 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct inode *inode;
 	struct fat_slot_info sinfo;
 	struct timespec64 ts;
-	struct timespec t;
 	int err;
 
 	mutex_lock(&MSDOS_SB(sb)->s_lock);
 
 	ts = current_time(dir);
-	t = timespec64_to_timespec(ts);
-	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo);
+	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
 	if (err)
 		goto out;
 	inode_inc_iversion(dir);
@@ -853,19 +851,17 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct inode *inode;
 	struct fat_slot_info sinfo;
 	struct timespec64 ts;
-	struct timespec t;
 	int err, cluster;
 
 	mutex_lock(&MSDOS_SB(sb)->s_lock);
 
 	ts = current_time(dir);
-	t = timespec64_to_timespec(ts);
-	cluster = fat_alloc_new_dir(dir, &t);
+	cluster = fat_alloc_new_dir(dir, &ts);
 	if (cluster < 0) {
 		err = cluster;
 		goto out;
 	}
-	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo);
+	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
 	if (err)
 		goto out_free;
 	inode_inc_iversion(dir);
@@ -904,7 +900,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec64 ts;
-	struct timespec t;
 	loff_t new_i_pos;
 	int err, is_dir, update_dotdot, corrupt = 0;
 	struct super_block *sb = old_dir->i_sb;
@@ -939,9 +934,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_i_pos = MSDOS_I(new_inode)->i_pos;
 		fat_detach(new_inode);
 	} else {
-		t = timespec64_to_timespec(ts);
 		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
-				     &t, &sinfo);
+				     &ts, &sinfo);
 		if (err)
 			goto out;
 		new_i_pos = sinfo.i_pos;
-- 
cgit v1.2.3


From 52cba1a274818c3ee6299c1a1b476e7bc5037a2f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 21:59:51 -0700
Subject: signal: make force_sigsegv() void

Patch series "signal: refactor some functions", v3.

This series refactors a bunch of functions in signal.c to simplify parts
of the code.

The greatest single change is declaring the static do_sigpending() helper
as void which makes it possible to remove a bunch of unnecessary checks in
the syscalls later on.

This patch (of 17):

force_sigsegv() returned 0 unconditionally so it doesn't make sense to have
it return at all. In addition, there are no callers that check
force_sigsegv()'s return value.

Link: http://lkml.kernel.org/r/20180602103653.18181-2-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/signal.h | 2 +-
 kernel/signal.c              | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 113d1ad1ced7..e138ac16c650 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -314,7 +314,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey);
 int force_sig_ptrace_errno_trap(int errno, void __user *addr);
 
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
-extern int force_sigsegv(int, struct task_struct *);
+extern void force_sigsegv(int sig, struct task_struct *p);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8d8a940422a8..8a828baa0f93 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1458,8 +1458,7 @@ send_sig(int sig, struct task_struct *p, int priv)
 	return send_sig_info(sig, __si_special(priv), p);
 }
 
-void
-force_sig(int sig, struct task_struct *p)
+void force_sig(int sig, struct task_struct *p)
 {
 	force_sig_info(sig, SEND_SIG_PRIV, p);
 }
@@ -1470,8 +1469,7 @@ force_sig(int sig, struct task_struct *p)
  * the problem was already a SIGSEGV, we'll want to
  * make sure we don't even try to deliver the signal..
  */
-int
-force_sigsegv(int sig, struct task_struct *p)
+void force_sigsegv(int sig, struct task_struct *p)
 {
 	if (sig == SIGSEGV) {
 		unsigned long flags;
@@ -1480,7 +1478,6 @@ force_sigsegv(int sig, struct task_struct *p)
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 	force_sig(SIGSEGV, p);
-	return 0;
 }
 
 int force_sig_fault(int sig, int code, void __user *addr
-- 
cgit v1.2.3


From bb17fcca078fc56ea7c7611cbe92687021cf6a31 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 21:59:55 -0700
Subject: signal: make kill_as_cred_perm() return bool

kill_as_cred_perm() already behaves like a boolean function.  Let's
actually declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-3-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 8a828baa0f93..5b0fb57b23f0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1339,14 +1339,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
-static int kill_as_cred_perm(const struct cred *cred,
-			     struct task_struct *target)
+static inline bool kill_as_cred_perm(const struct cred *cred,
+				     struct task_struct *target)
 {
 	const struct cred *pcred = __task_cred(target);
-	if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
-	    !uid_eq(cred->uid,  pcred->suid) && !uid_eq(cred->uid,  pcred->uid))
-		return 0;
-	return 1;
+
+	return uid_eq(cred->euid, pcred->suid) ||
+	       uid_eq(cred->euid, pcred->uid) ||
+	       uid_eq(cred->uid, pcred->suid) ||
+	       uid_eq(cred->uid, pcred->uid);
 }
 
 /* like kill_pid_info(), but doesn't use uid/euid of "current" */
-- 
cgit v1.2.3


From 6527de953354e550bf6d941fdcd1aaf832317e8d Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 21:59:59 -0700
Subject: signal: make may_ptrace_stop() return bool

may_ptrace_stop() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-4-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 5b0fb57b23f0..f03886cd064e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1875,10 +1875,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	spin_unlock_irqrestore(&sighand->siglock, flags);
 }
 
-static inline int may_ptrace_stop(void)
+static inline bool may_ptrace_stop(void)
 {
 	if (!likely(current->ptrace))
-		return 0;
+		return false;
 	/*
 	 * Are we in the middle of do_coredump?
 	 * If so and our tracer is also part of the coredump stopping
@@ -1894,9 +1894,9 @@ static inline int may_ptrace_stop(void)
 	 */
 	if (unlikely(current->mm->core_state) &&
 	    unlikely(current->mm == current->parent->mm))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 /*
-- 
cgit v1.2.3


From b1d294c80393e7ba18b3c646fe9d8e6a206c7988 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:02 -0700
Subject: signal: make do_sigpending() void

do_sigpending() returned 0 unconditionally so it doesn't make sense to
have it return at all.  This allows us to simplify a bunch of syscall
callers.

Link: http://lkml.kernel.org/r/20180602103653.18181-5-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index f03886cd064e..10918c7b2ee3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2753,7 +2753,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 }
 #endif
 
-static int do_sigpending(sigset_t *set)
+static void do_sigpending(sigset_t *set)
 {
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(set, &current->pending.signal,
@@ -2762,7 +2762,6 @@ static int do_sigpending(sigset_t *set)
 
 	/* Outside the lock because only this thread touches it.  */
 	sigandsets(set, &current->blocked, set);
-	return 0;
 }
 
 /**
@@ -2774,15 +2773,16 @@ static int do_sigpending(sigset_t *set)
 SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 {
 	sigset_t set;
-	int err;
 
 	if (sigsetsize > sizeof(*uset))
 		return -EINVAL;
 
-	err = do_sigpending(&set);
-	if (!err && copy_to_user(uset, &set, sigsetsize))
-		err = -EFAULT;
-	return err;
+	do_sigpending(&set);
+
+	if (copy_to_user(uset, &set, sigsetsize))
+		return -EFAULT;
+
+	return 0;
 }
 
 #ifdef CONFIG_COMPAT
@@ -2790,15 +2790,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
 	sigset_t set;
-	int err;
 
 	if (sigsetsize > sizeof(*uset))
 		return -EINVAL;
 
-	err = do_sigpending(&set);
-	if (!err)
-		err = put_compat_sigset(uset, &set, sigsetsize);
-	return err;
+	do_sigpending(&set);
+
+	return put_compat_sigset(uset, &set, sigsetsize);
 }
 #endif
 
@@ -3560,25 +3558,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
 SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
 {
 	sigset_t set;
-	int err;
 
 	if (sizeof(old_sigset_t) > sizeof(*uset))
 		return -EINVAL;
 
-	err = do_sigpending(&set);
-	if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
-		err = -EFAULT;
-	return err;
+	do_sigpending(&set);
+
+	if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
+		return -EFAULT;
+
+	return 0;
 }
 
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
 	sigset_t set;
-	int err = do_sigpending(&set);
-	if (!err)
-		err = put_user(set.sig[0], set32);
-	return err;
+
+	do_sigpending(&set);
+
+	return put_user(set.sig[0], set32);
 }
 #endif
 
-- 
cgit v1.2.3


From d8f993b3dba05ff354f9b9592afc44b1b3dbfd46 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:07 -0700
Subject: signal: simplify rt_sigaction()

The goto is not needed and does not add any clarity.  Simply return
-EINVAL on unexpected sigset_t struct size directly.

Link: http://lkml.kernel.org/r/20180602103653.18181-6-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 10918c7b2ee3..adeee5095039 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3648,25 +3648,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
 		size_t, sigsetsize)
 {
 	struct k_sigaction new_sa, old_sa;
-	int ret = -EINVAL;
+	int ret;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
 	if (sigsetsize != sizeof(sigset_t))
-		goto out;
+		return -EINVAL;
 
-	if (act) {
-		if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
-			return -EFAULT;
-	}
+	if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+		return -EFAULT;
 
 	ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+	if (ret)
+		return ret;
 
-	if (!ret && oact) {
-		if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
-			return -EFAULT;
-	}
-out:
-	return ret;
+	if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+		return -EFAULT;
+
+	return 0;
 }
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
-- 
cgit v1.2.3


From 2a9b90940957d3c0f744011b02b4575323be5947 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:11 -0700
Subject: signal: make kill_ok_by_cred() return bool

kill_ok_by_cred() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-7-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index adeee5095039..0e1bb87415c4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -717,21 +717,16 @@ static inline bool si_fromuser(const struct siginfo *info)
 /*
  * called with RCU read lock from check_kill_permission()
  */
-static int kill_ok_by_cred(struct task_struct *t)
+static bool kill_ok_by_cred(struct task_struct *t)
 {
 	const struct cred *cred = current_cred();
 	const struct cred *tcred = __task_cred(t);
 
-	if (uid_eq(cred->euid, tcred->suid) ||
-	    uid_eq(cred->euid, tcred->uid)  ||
-	    uid_eq(cred->uid,  tcred->suid) ||
-	    uid_eq(cred->uid,  tcred->uid))
-		return 1;
-
-	if (ns_capable(tcred->user_ns, CAP_KILL))
-		return 1;
-
-	return 0;
+	return uid_eq(cred->euid, tcred->suid) ||
+	       uid_eq(cred->euid, tcred->uid) ||
+	       uid_eq(cred->uid, tcred->suid) ||
+	       uid_eq(cred->uid, tcred->uid) ||
+	       ns_capable(tcred->user_ns, CAP_KILL);
 }
 
 /*
-- 
cgit v1.2.3


From e4a8b4efbfdf8ce86b1d410cc96d2790a05f1fd5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:15 -0700
Subject: signal: make sig_handler_ignored() return bool

sig_handler_ignored() already behaves like a boolean function.  Let's
actually declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-8-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 0e1bb87415c4..a032a9bae5fb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -65,11 +65,11 @@ static void __user *sig_handler(struct task_struct *t, int sig)
 	return t->sighand->action[sig - 1].sa.sa_handler;
 }
 
-static int sig_handler_ignored(void __user *handler, int sig)
+static inline bool sig_handler_ignored(void __user *handler, int sig)
 {
 	/* Is it explicitly or implicitly ignored? */
 	return handler == SIG_IGN ||
-		(handler == SIG_DFL && sig_kernel_ignore(sig));
+	       (handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
 static int sig_task_ignored(struct task_struct *t, int sig, bool force)
-- 
cgit v1.2.3


From 41aaa481197dc566f691d403e0247bb53a017770 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:19 -0700
Subject: signal: make sig_task_ignored() return bool

sig_task_ignored() already behaves like a boolean function.  Let's
actually declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-9-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index a032a9bae5fb..6e92adddd667 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -72,7 +72,7 @@ static inline bool sig_handler_ignored(void __user *handler, int sig)
 	       (handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_task_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
 {
 	void __user *handler;
 
@@ -80,7 +80,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
 	    handler == SIG_DFL && !(force && sig_kernel_only(sig)))
-		return 1;
+		return true;
 
 	return sig_handler_ignored(handler, sig);
 }
-- 
cgit v1.2.3


From 6a0cdcd78892d4508e77cbec913eaf099ab4a8e9 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:23 -0700
Subject: signal: make sig_ignored() return bool

sig_ignored() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-10-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 6e92adddd667..bcd4272639ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -85,7 +85,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
 	return sig_handler_ignored(handler, sig);
 }
 
-static int sig_ignored(struct task_struct *t, int sig, bool force)
+static bool sig_ignored(struct task_struct *t, int sig, bool force)
 {
 	/*
 	 * Blocked signals are never ignored, since the
@@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
 	 * unblocked.
 	 */
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
-		return 0;
+		return false;
 
 	/*
 	 * Tracers may want to know about even ignored signal unless it
@@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
 	 * by SIGNAL_UNKILLABLE task.
 	 */
 	if (t->ptrace && sig != SIGKILL)
-		return 0;
+		return false;
 
 	return sig_task_ignored(t, sig, force);
 }
-- 
cgit v1.2.3


From 938696a82974c33b93be9657b72521bc84f5c587 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:27 -0700
Subject: signal: make has_pending_signals() return bool

has_pending_signals() already behaves like a boolean function.  Let's
actually declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-11-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index bcd4272639ca..e00e96a7b24f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -110,7 +110,7 @@ static bool sig_ignored(struct task_struct *t, int sig, bool force)
  * Re-calculate pending state from the set of locally pending
  * signals, globally pending signals, and blocked signals.
  */
-static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
 {
 	unsigned long ready;
 	long i;
-- 
cgit v1.2.3


From 09ae854edb2d275d98a874c21c24c58cee4df14e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:30 -0700
Subject: signal: make recalc_sigpending_tsk() return bool

recalc_sigpending_tsk() already behaves like a boolean function.  Let's
actually declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-12-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index e00e96a7b24f..7eec2dba597e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -138,20 +138,21 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 #define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
 
-static int recalc_sigpending_tsk(struct task_struct *t)
+static bool recalc_sigpending_tsk(struct task_struct *t)
 {
 	if ((t->jobctl & JOBCTL_PENDING_MASK) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
-		return 1;
+		return true;
 	}
+
 	/*
 	 * We must never clear the flag in another thread, or in current
 	 * when it's possible the current syscall is returning -ERESTART*.
 	 * So we don't clear it here, and only callers who know they should do.
 	 */
-	return 0;
+	return false;
 }
 
 /*
-- 
cgit v1.2.3


From 67a48a24478841525ba73ec6d64caaf079cf3b7c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:34 -0700
Subject: signal: make unhandled_signal() return bool

unhandled_signal() already behaves like a boolean function.  Let's
actually declare it as such too.  All callers treat it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-13-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 2 +-
 kernel/signal.c        | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3c5200137b24..fa23cae66758 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -287,7 +287,7 @@ static inline void disallow_signal(int sig)
 
 extern struct kmem_cache *sighand_cachep;
 
-int unhandled_signal(struct task_struct *tsk, int sig);
+extern bool unhandled_signal(struct task_struct *tsk, int sig);
 
 /*
  * In POSIX a signal is sent either to a specific thread (Linux task)
diff --git a/kernel/signal.c b/kernel/signal.c
index 7eec2dba597e..c10c09fc4ec3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -505,13 +505,15 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 	}
 }
 
-int unhandled_signal(struct task_struct *tsk, int sig)
+bool unhandled_signal(struct task_struct *tsk, int sig)
 {
 	void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
 	if (is_global_init(tsk))
-		return 1;
+		return true;
+
 	if (handler != SIG_IGN && handler != SIG_DFL)
-		return 0;
+		return false;
+
 	/* if ptraced, let the tracer determine */
 	return !tsk->ptrace;
 }
-- 
cgit v1.2.3


From 8f11351eee9524c179361fcbf1538c993f2390e4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:38 -0700
Subject: signal: make flush_sigqueue_mask() void

The return value of flush_sigqueue_mask() is never checked anywhere.

Link: http://lkml.kernel.org/r/20180602103653.18181-14-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index c10c09fc4ec3..fe0e9e82ce44 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -687,14 +687,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
  *
  * All callers must be holding the siglock.
  */
-static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
+static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
 {
 	struct sigqueue *q, *n;
 	sigset_t m;
 
 	sigandsets(&m, mask, &s->signal);
 	if (sigisemptyset(&m))
-		return 0;
+		return;
 
 	sigandnsets(&s->signal, &s->signal, mask);
 	list_for_each_entry_safe(q, n, &s->list, list) {
@@ -703,7 +703,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
 			__sigqueue_free(q);
 		}
 	}
-	return 1;
 }
 
 static inline int is_si_special(const struct siginfo *info)
-- 
cgit v1.2.3


From acd14e62f075f44cd04e1ac2920a515f9b80146e Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:42 -0700
Subject: signal: make wants_signal() return bool

wants_signal() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-15-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index fe0e9e82ce44..0e48dbc6649e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -879,16 +879,20 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
  * as soon as they're available, so putting the signal on the shared queue
  * will be equivalent to sending it to one such thread.
  */
-static inline int wants_signal(int sig, struct task_struct *p)
+static inline bool wants_signal(int sig, struct task_struct *p)
 {
 	if (sigismember(&p->blocked, sig))
-		return 0;
+		return false;
+
 	if (p->flags & PF_EXITING)
-		return 0;
+		return false;
+
 	if (sig == SIGKILL)
-		return 1;
+		return true;
+
 	if (task_is_stopped_or_traced(p))
-		return 0;
+		return false;
+
 	return task_curr(p) || !signal_pending(p);
 }
 
-- 
cgit v1.2.3


From a19e2c01f71bb4b8442db450200a01a28db36a04 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:46 -0700
Subject: signal: make legacy_queue() return bool

legacy_queue() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-16-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 0e48dbc6649e..3de1ba2af032 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	return;
 }
 
-static inline int legacy_queue(struct sigpending *signals, int sig)
+static inline bool legacy_queue(struct sigpending *signals, int sig)
 {
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
-- 
cgit v1.2.3


From f99e9d8c5c7f5539e6d97fd04a4ad213e52daac0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:50 -0700
Subject: signal: make sigkill_pending() return bool

sigkill_pending() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-17-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 3de1ba2af032..9f9b4183178e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1904,10 +1904,10 @@ static inline bool may_ptrace_stop(void)
  * Return non-zero if there is a SIGKILL that should be waking us up.
  * Called with the siglock held.
  */
-static int sigkill_pending(struct task_struct *tsk)
+static bool sigkill_pending(struct task_struct *tsk)
 {
-	return	sigismember(&tsk->pending.signal, SIGKILL) ||
-		sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+	return sigismember(&tsk->pending.signal, SIGKILL) ||
+	       sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
 }
 
 /*
-- 
cgit v1.2.3


From 20ab7218d2507b2dbec9c053c2f1b96054e266b6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 21 Aug 2018 22:00:54 -0700
Subject: signal: make get_signal() return bool

make get_signal() already behaves like a boolean function.  Let's actually
declare it as such too.

Link: http://lkml.kernel.org/r/20180602103653.18181-18-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Morris <james.morris@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/signal.h | 2 +-
 kernel/signal.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/signal.h b/include/linux/signal.h
index fa23cae66758..e3d9e0640e6e 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -265,7 +265,7 @@ extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 
-extern int get_signal(struct ksignal *ksig);
+extern bool get_signal(struct ksignal *ksig);
 extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
 extern void exit_signals(struct task_struct *tsk);
 extern void kernel_sigaction(int, __sighandler_t);
diff --git a/kernel/signal.c b/kernel/signal.c
index 9f9b4183178e..786bacc60649 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2287,7 +2287,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
 	return signr;
 }
 
-int get_signal(struct ksignal *ksig)
+bool get_signal(struct ksignal *ksig)
 {
 	struct sighand_struct *sighand = current->sighand;
 	struct signal_struct *signal = current->signal;
@@ -2297,7 +2297,7 @@ int get_signal(struct ksignal *ksig)
 		task_work_run();
 
 	if (unlikely(uprobe_deny_signal()))
-		return 0;
+		return false;
 
 	/*
 	 * Do this once, we can't return to user-mode if freezing() == T.
-- 
cgit v1.2.3


From 06e62a46bbba20aa5286102016a04214bb446141 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 21 Aug 2018 22:00:58 -0700
Subject: fork: don't copy inconsistent signal handler state to child

Before this change, if a multithreaded process forks while one of its
threads is changing a signal handler using sigaction(), the memcpy() in
copy_sighand() can race with the struct assignment in do_sigaction().  It
isn't clear whether this can cause corruption of the userspace signal
handler pointer, but it definitely can cause inconsistency between
different fields of struct sigaction.

Take the appropriate spinlock to avoid this.

I have tested that this patch prevents inconsistency between sa_sigaction
and sa_flags, which is possible before this patch.

Link: http://lkml.kernel.org/r/20180702145108.73189-1-jannh@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 8bcef2413f1b..23eb960c701d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1427,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 		return -ENOMEM;
 
 	atomic_set(&sig->count, 1);
+	spin_lock_irq(&current->sighand->siglock);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+	spin_unlock_irq(&current->sighand->siglock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0ba7f398f39ebc33a695058a8d12c4d795430ffa Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 21 Aug 2018 22:01:01 -0700
Subject: drivers/rapidio/devices/rio_mport_cdev.c: remove redundant pointer md

Pointer md is being assigned but is never used hence it is redundant and
can be removed.

Cleans up clang warning:
warning: variable 'md' set but not used [-Wunused-but-set-variable]

Link: http://lkml.kernel.org/r/20180711082346.5223-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Alexandre Bounine <alex.bou9@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/devices/rio_mport_cdev.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index a8cb8d2f2abb..cbe467ff1aba 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -1006,7 +1006,6 @@ out_free:
 static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
 {
 	struct mport_cdev_priv *priv;
-	struct mport_dev *md;
 	struct rio_async_tx_wait w_param;
 	struct mport_dma_req *req;
 	dma_cookie_t cookie;
@@ -1016,7 +1015,6 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
 	int ret;
 
 	priv = (struct mport_cdev_priv *)filp->private_data;
-	md = priv->md;
 
 	if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param))))
 		return -EFAULT;
-- 
cgit v1.2.3


From 5f733e8a2dd130eaeed24cbaef49d349206cdb8b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 21 Aug 2018 22:01:06 -0700
Subject: kernel/sysctl.c: fix typos in comments

Fix a few typos/spellos in kernel/sysctl.c.

Link: http://lkml.kernel.org/r/bb09a8b9-f984-6dd4-b07b-3ecaf200862e@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dbdcb6673b27..71ceb6c13c1a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -225,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
 
 #ifdef CONFIG_MAGIC_SYSRQ
-/* Note: sysrq code uses it's own private copy */
+/* Note: sysrq code uses its own private copy */
 static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
 
 static int sysrq_sysctl_handler(struct ctl_table *table, int write,
@@ -1976,13 +1976,13 @@ static void warn_sysctl_write(struct ctl_table *table)
 }
 
 /**
- * proc_first_pos_non_zero_ignore - check if firs position is allowed
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
  * @ppos: file position
  * @table: the sysctl table
  *
  * Returns true if the first position is non-zero and the sysctl_writes_strict
  * mode indicates this is not allowed for numeric input types. String proc
- * hadlers can ignore the return value.
+ * handlers can ignore the return value.
  */
 static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
 					   struct ctl_table *table)
-- 
cgit v1.2.3


From d9edcbc42c77b719e03dedb2aff719ae19659a0f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 22:01:09 -0700
Subject: adfs: use timespec64 for time conversion

We just truncate the seconds to 32-bit in one place now, so this can
trivially be converted over to using timespec64 consistently.

Link: http://lkml.kernel.org/r/20180620100133.4035614-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/adfs/inode.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e91028d4340a..66621e96f9af 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -167,7 +167,7 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
  * of time to convert from RISC OS epoch to Unix epoch.
  */
 static void
-adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
+adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode)
 {
 	unsigned int high, low;
 	/* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
@@ -195,11 +195,11 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
 	/* convert from RISC OS to Unix epoch */
 	nsec -= nsec_unix_epoch_diff_risc_os_epoch;
 
-	*tv = ns_to_timespec(nsec);
+	*tv = ns_to_timespec64(nsec);
 	return;
 
  cur_time:
-	*tv = timespec64_to_timespec(current_time(inode));
+	*tv = current_time(inode);
 	return;
 
  too_early:
@@ -242,7 +242,6 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs)
 struct inode *
 adfs_iget(struct super_block *sb, struct object_info *obj)
 {
-	struct timespec ts;
 	struct inode *inode;
 
 	inode = new_inode(sb);
@@ -271,9 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
 	ADFS_I(inode)->stamped   = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
 
 	inode->i_mode	 = adfs_atts2mode(sb, inode);
-	ts = timespec64_to_timespec(inode->i_mtime);
-	adfs_adfs2unix_time(&ts, inode);
-	inode->i_mtime = timespec_to_timespec64(ts);
+	adfs_adfs2unix_time(&inode->i_mtime, inode);
 	inode->i_atime = inode->i_mtime;
 	inode->i_ctime = inode->i_mtime;
 
-- 
cgit v1.2.3


From 3e811f053aec66e8a6d2a0ee3d031e7c988e3d15 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 21 Aug 2018 22:01:13 -0700
Subject: fs/sysv/inode.c: use ktime_get_real_seconds() for superblock stamp

get_seconds() is deprecated in favor of ktime_get_real_seconds(), which
returns a 64-bit timestamp.

In the SYSV file system, the superblock timestamp is only 32 bits wide,
and it is used to check whether a file system is clean, so the best
solution seems to be to force a wraparound and explicitly convert it to an
unsigned 32-bit value.

This is independent of the inode timestamps that are also 32-bit wide on
disk and that come from current_time().

Link: http://lkml.kernel.org/r/20180713145236.3152513-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sysv/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index bec9f79adb25..499a20a5a010 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -35,7 +35,7 @@
 static int sysv_sync_fs(struct super_block *sb, int wait)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
-	unsigned long time = get_seconds(), old_time;
+	u32 time = (u32)ktime_get_real_seconds(), old_time;
 
 	mutex_lock(&sbi->s_lock);
 
@@ -46,8 +46,8 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
 	 */
 	old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
 	if (sbi->s_type == FSTYPE_SYSV4) {
-		if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
-			*sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time);
+		if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time))
+			*sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time);
 		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
 		mark_buffer_dirty(sbi->s_bh2);
 	}
-- 
cgit v1.2.3


From 5cb366bb3a746f6b06ea086b322e21e345401c9d Mon Sep 17 00:00:00 2001
From: Adrian Reber <adrian@lisas.de>
Date: Tue, 21 Aug 2018 22:01:17 -0700
Subject: init/Kconfig: remove EXPERT from CHECKPOINT_RESTORE

The CHECKPOINT_RESTORE configuration option was introduced in 2012 and
combined with EXPERT.  CHECKPOINT_RESTORE is already enabled in many
distribution kernels and also part of the defconfigs of various
architectures.

To make it easier for distributions to enable CHECKPOINT_RESTORE this
removes EXPERT and moves the configuration option out of the EXPERT block.

Link: http://lkml.kernel.org/r/20180712130733.11510-1-adrian@lisas.de
Signed-off-by: Adrian Reber <adrian@lisas.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.ibm.com>
Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 5eb1ef7a275b..641dd7dd7c8a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -967,6 +967,18 @@ config NET_NS
 
 endif # NAMESPACES
 
+config CHECKPOINT_RESTORE
+	bool "Checkpoint/restore support"
+	select PROC_CHILDREN
+	default n
+	help
+	  Enables additional kernel features in a sake of checkpoint/restore.
+	  In particular it adds auxiliary prctl codes to setup process text,
+	  data and heap segment sizes, and a few additional /proc filesystem
+	  entries.
+
+	  If unsure, say N here.
+
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
 	select CGROUPS
@@ -1383,18 +1395,6 @@ config MEMBARRIER
 
 	  If unsure, say Y.
 
-config CHECKPOINT_RESTORE
-	bool "Checkpoint/restore support" if EXPERT
-	select PROC_CHILDREN
-	default n
-	help
-	  Enables additional kernel features in a sake of checkpoint/restore.
-	  In particular it adds auxiliary prctl codes to setup process text,
-	  data and heap segment sizes, and a few additional /proc filesystem
-	  entries.
-
-	  If unsure, say N here.
-
 config KALLSYMS
 	 bool "Load all symbols for debugging/ksymoops" if EXPERT
 	 default y
-- 
cgit v1.2.3


From 615c999cd8a07b7c3c93bbdee89ef705d2ce52e1 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:01:21 -0700
Subject: ipc: compute kern_ipc_perm.id under the ipc lock

ipc_addid() initializes kern_ipc_perm.id after having called
ipc_idr_alloc().

Thus a parallel semctl() or msgctl() that uses e.g.  MSG_STAT may use this
unitialized value as the return code.

The patch moves all accesses to kern_ipc_perm.id under the spin_lock().

The issues is related to the finding of
syzbot+2827ef6b3385deb07eaf@syzkaller.appspotmail.com: syzbot found an
issue with kern_ipc_perm.seq

Link: http://lkml.kernel.org/r/20180712185241.4017-2-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/msg.c | 19 ++++++++++++++-----
 ipc/sem.c | 18 +++++++++++++-----
 ipc/shm.c | 19 ++++++++++++++-----
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 203281198079..7cb89a9b24e2 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -492,7 +492,6 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
 			 int cmd, struct msqid64_ds *p)
 {
 	struct msg_queue *msq;
-	int id = 0;
 	int err;
 
 	memset(p, 0, sizeof(*p));
@@ -504,7 +503,6 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
 			err = PTR_ERR(msq);
 			goto out_unlock;
 		}
-		id = msq->q_perm.id;
 	} else { /* IPC_STAT */
 		msq = msq_obtain_object_check(ns, msqid);
 		if (IS_ERR(msq)) {
@@ -549,10 +547,21 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid,
 	p->msg_lspid  = pid_vnr(msq->q_lspid);
 	p->msg_lrpid  = pid_vnr(msq->q_lrpid);
 
-	ipc_unlock_object(&msq->q_perm);
-	rcu_read_unlock();
-	return id;
+	if (cmd == IPC_STAT) {
+		/*
+		 * As defined in SUS:
+		 * Return 0 on success
+		 */
+		err = 0;
+	} else {
+		/*
+		 * MSG_STAT and MSG_STAT_ANY (both Linux specific)
+		 * Return the full id, including the sequence number
+		 */
+		err = msq->q_perm.id;
+	}
 
+	ipc_unlock_object(&msq->q_perm);
 out_unlock:
 	rcu_read_unlock();
 	return err;
diff --git a/ipc/sem.c b/ipc/sem.c
index 00ef2f743a62..18e50f464f76 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1223,7 +1223,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
 {
 	struct sem_array *sma;
 	time64_t semotime;
-	int id = 0;
 	int err;
 
 	memset(semid64, 0, sizeof(*semid64));
@@ -1235,7 +1234,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
 			err = PTR_ERR(sma);
 			goto out_unlock;
 		}
-		id = sma->sem_perm.id;
 	} else { /* IPC_STAT */
 		sma = sem_obtain_object_check(ns, semid);
 		if (IS_ERR(sma)) {
@@ -1275,10 +1273,20 @@ static int semctl_stat(struct ipc_namespace *ns, int semid,
 #endif
 	semid64->sem_nsems = sma->sem_nsems;
 
+	if (cmd == IPC_STAT) {
+		/*
+		 * As defined in SUS:
+		 * Return 0 on success
+		 */
+		err = 0;
+	} else {
+		/*
+		 * SEM_STAT and SEM_STAT_ANY (both Linux specific)
+		 * Return the full id, including the sequence number
+		 */
+		err = sma->sem_perm.id;
+	}
 	ipc_unlock_object(&sma->sem_perm);
-	rcu_read_unlock();
-	return id;
-
 out_unlock:
 	rcu_read_unlock();
 	return err;
diff --git a/ipc/shm.c b/ipc/shm.c
index b204feb38274..b3b089315d3b 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -962,7 +962,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
 			int cmd, struct shmid64_ds *tbuf)
 {
 	struct shmid_kernel *shp;
-	int id = 0;
 	int err;
 
 	memset(tbuf, 0, sizeof(*tbuf));
@@ -974,7 +973,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
 			err = PTR_ERR(shp);
 			goto out_unlock;
 		}
-		id = shp->shm_perm.id;
 	} else { /* IPC_STAT */
 		shp = shm_obtain_object_check(ns, shmid);
 		if (IS_ERR(shp)) {
@@ -1024,10 +1022,21 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid,
 	tbuf->shm_lpid	= pid_vnr(shp->shm_lprid);
 	tbuf->shm_nattch = shp->shm_nattch;
 
-	ipc_unlock_object(&shp->shm_perm);
-	rcu_read_unlock();
-	return id;
+	if (cmd == IPC_STAT) {
+		/*
+		 * As defined in SUS:
+		 * Return 0 on success
+		 */
+		err = 0;
+	} else {
+		/*
+		 * SHM_STAT and SHM_STAT_ANY (both Linux specific)
+		 * Return the full id, including the sequence number
+		 */
+		err = shp->shm_perm.id;
+	}
 
+	ipc_unlock_object(&shp->shm_perm);
 out_unlock:
 	rcu_read_unlock();
 	return err;
-- 
cgit v1.2.3


From e2652ae6bd7492cdc0436817cbcd09282eb0bb03 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:01:25 -0700
Subject: ipc: reorganize initialization of kern_ipc_perm.seq

ipc_addid() initializes kern_ipc_perm.seq after having called idr_alloc()
(within ipc_idr_alloc()).

Thus a parallel semop() or msgrcv() that uses ipc_obtain_object_check()
may see an uninitialized value.

The patch moves the initialization of kern_ipc_perm.seq before the calls
of idr_alloc().

Notes:
1) This patch has a user space visible side effect:
If /proc/sys/kernel/*_next_id is used (i.e.: checkpoint/restore) and
if semget()/msgget()/shmget() fails in the final step of adding the id
to the rhash tree, then .._next_id is cleared. Before the patch, is
remained unmodified.

There is no change of the behavior after a successful ..get() call: It
always clears .._next_id, there is no impact to non checkpoint/restore
code as that code does not use .._next_id.

2) The patch correctly documents that after a call to ipc_idr_alloc(),
the full tear-down sequence must be used. The callers of ipc_addid()
do not fullfill that, i.e. more bugfixes are required.

The patch is a squash of a patch from Dmitry and my own changes.

Link: http://lkml.kernel.org/r/20180712185241.4017-3-manfred@colorfullife.com
Reported-by: syzbot+2827ef6b3385deb07eaf@syzkaller.appspotmail.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt |  3 +-
 ipc/util.c                      | 91 ++++++++++++++++++++++-------------------
 2 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 9d38e75b19a0..37a679501ddc 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -464,7 +464,8 @@ Notes:
 1) kernel doesn't guarantee, that new object will have desired id. So,
 it's up to userspace, how to handle an object with "wrong" id.
 2) Toggle with non-default value will be set back to -1 by kernel after
-successful IPC object allocation.
+successful IPC object allocation. If an IPC object allocation syscall
+fails, it is undefined if the value remains unmodified or is reset to -1.
 
 ==============================================================
 
diff --git a/ipc/util.c b/ipc/util.c
index fdffff41f65b..e5c9e2b2e4c4 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -194,46 +194,54 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
 	return NULL;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
 /*
- * Specify desired id for next allocated IPC object.
+ * Insert new IPC object into idr tree, and set sequence number and id
+ * in the correct order.
+ * Especially:
+ * - the sequence number must be set before inserting the object into the idr,
+ *   because the sequence number is accessed without a lock.
+ * - the id can/must be set after inserting the object into the idr.
+ *   All accesses must be done after getting kern_ipc_perm.lock.
+ *
+ * The caller must own kern_ipc_perm.lock.of the new object.
+ * On error, the function returns a (negative) error code.
  */
-#define ipc_idr_alloc(ids, new)						\
-	idr_alloc(&(ids)->ipcs_idr, (new),				\
-		  (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\
-		  0, GFP_NOWAIT)
-
-static inline int ipc_buildid(int id, struct ipc_ids *ids,
-			      struct kern_ipc_perm *new)
+static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
 {
-	if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */
+	int idx, next_id = -1;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	next_id = ids->next_id;
+	ids->next_id = -1;
+#endif
+
+	/*
+	 * As soon as a new object is inserted into the idr,
+	 * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
+	 * and the lockless preparations for ipc operations can start.
+	 * This means especially: permission checks, audit calls, allocation
+	 * of undo structures, ...
+	 *
+	 * Thus the object must be fully initialized, and if something fails,
+	 * then the full tear-down sequence must be followed.
+	 * (i.e.: set new->deleted, reduce refcount, call_rcu())
+	 */
+
+	if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
 		new->seq = ids->seq++;
 		if (ids->seq > IPCID_SEQ_MAX)
 			ids->seq = 0;
+		idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
 	} else {
-		new->seq = ipcid_to_seqx(ids->next_id);
-		ids->next_id = -1;
+		new->seq = ipcid_to_seqx(next_id);
+		idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
+				0, GFP_NOWAIT);
 	}
-
-	return SEQ_MULTIPLIER * new->seq + id;
+	if (idx >= 0)
+		new->id = SEQ_MULTIPLIER * new->seq + idx;
+	return idx;
 }
 
-#else
-#define ipc_idr_alloc(ids, new)					\
-	idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT)
-
-static inline int ipc_buildid(int id, struct ipc_ids *ids,
-			      struct kern_ipc_perm *new)
-{
-	new->seq = ids->seq++;
-	if (ids->seq > IPCID_SEQ_MAX)
-		ids->seq = 0;
-
-	return SEQ_MULTIPLIER * new->seq + id;
-}
-
-#endif /* CONFIG_CHECKPOINT_RESTORE */
-
 /**
  * ipc_addid - add an ipc identifier
  * @ids: ipc identifier set
@@ -251,7 +259,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 {
 	kuid_t euid;
 	kgid_t egid;
-	int id, err;
+	int idx, err;
 
 	if (limit > IPCMNI)
 		limit = IPCMNI;
@@ -271,30 +279,27 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	new->cuid = new->uid = euid;
 	new->gid = new->cgid = egid;
 
-	id = ipc_idr_alloc(ids, new);
+	idx = ipc_idr_alloc(ids, new);
 	idr_preload_end();
 
-	if (id >= 0 && new->key != IPC_PRIVATE) {
+	if (idx >= 0 && new->key != IPC_PRIVATE) {
 		err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
 					     ipc_kht_params);
 		if (err < 0) {
-			idr_remove(&ids->ipcs_idr, id);
-			id = err;
+			idr_remove(&ids->ipcs_idr, idx);
+			idx = err;
 		}
 	}
-	if (id < 0) {
+	if (idx < 0) {
 		spin_unlock(&new->lock);
 		rcu_read_unlock();
-		return id;
+		return idx;
 	}
 
 	ids->in_use++;
-	if (id > ids->max_id)
-		ids->max_id = id;
-
-	new->id = ipc_buildid(id, ids, new);
-
-	return id;
+	if (idx > ids->max_id)
+		ids->max_id = idx;
+	return idx;
 }
 
 /**
-- 
cgit v1.2.3


From 39cfffd774a2e8818250360a3e028b5eac9d5392 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:01:29 -0700
Subject: ipc/util.c: use ipc_rcu_putref() for failues in ipc_addid()

ipc_addid() is impossible to use:
- for certain failures, the caller must not use ipc_rcu_putref(),
  because the reference counter is not yet initialized.
- for other failures, the caller must use ipc_rcu_putref(),
  because parallel operations could be ongoing already.

The patch cleans that up, by initializing the refcount early, and by
modifying all callers.

The issues is related to the finding of
syzbot+2827ef6b3385deb07eaf@syzkaller.appspotmail.com: syzbot found an
issue with reading kern_ipc_perm.seq, here both read and write to already
released memory could happen.

Link: http://lkml.kernel.org/r/20180712185241.4017-4-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/msg.c  |  2 +-
 ipc/sem.c  |  2 +-
 ipc/shm.c  |  2 ++
 ipc/util.c | 10 ++++++++--
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 7cb89a9b24e2..3132aa27ff88 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -163,7 +163,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 	/* ipc_addid() locks msq upon success. */
 	retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
 	if (retval < 0) {
-		call_rcu(&msq->q_perm.rcu, msg_rcu_free);
+		ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
 		return retval;
 	}
 
diff --git a/ipc/sem.c b/ipc/sem.c
index 18e50f464f76..8e72037b894a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -557,7 +557,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	/* ipc_addid() locks sma upon success. */
 	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
 	if (retval < 0) {
-		call_rcu(&sma->sem_perm.rcu, sem_rcu_free);
+		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
 		return retval;
 	}
 	ns->used_sems += nsems;
diff --git a/ipc/shm.c b/ipc/shm.c
index b3b089315d3b..6e7bd9830549 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -684,6 +684,8 @@ no_id:
 	if (is_file_hugepages(file) && shp->mlock_user)
 		user_shm_unlock(size, shp->mlock_user);
 	fput(file);
+	ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
+	return error;
 no_file:
 	call_rcu(&shp->shm_perm.rcu, shm_rcu_free);
 	return error;
diff --git a/ipc/util.c b/ipc/util.c
index e5c9e2b2e4c4..465bbd21d234 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -251,7 +251,9 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
  * Add an entry 'new' to the ipc ids idr. The permissions object is
  * initialised and the first free entry is set up and the id assigned
  * is returned. The 'new' entry is returned in a locked state on success.
+ *
  * On failure the entry is not locked and a negative err-code is returned.
+ * The caller must use ipc_rcu_putref() to free the identifier.
  *
  * Called with writer ipc_ids.rwsem held.
  */
@@ -261,6 +263,9 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	kgid_t egid;
 	int idx, err;
 
+	/* 1) Initialize the refcount so that ipc_rcu_putref works */
+	refcount_set(&new->refcount, 1);
+
 	if (limit > IPCMNI)
 		limit = IPCMNI;
 
@@ -269,9 +274,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 
 	idr_preload(GFP_KERNEL);
 
-	refcount_set(&new->refcount, 1);
 	spin_lock_init(&new->lock);
-	new->deleted = false;
 	rcu_read_lock();
 	spin_lock(&new->lock);
 
@@ -279,6 +282,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	new->cuid = new->uid = euid;
 	new->gid = new->cgid = egid;
 
+	new->deleted = false;
+
 	idx = ipc_idr_alloc(ids, new);
 	idr_preload_end();
 
@@ -291,6 +296,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 		}
 	}
 	if (idx < 0) {
+		new->deleted = true;
 		spin_unlock(&new->lock);
 		rcu_read_unlock();
 		return idx;
-- 
cgit v1.2.3


From 4241c1a304078569f544d51eeaf8bc270b6e377a Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:01:34 -0700
Subject: ipc: rename ipcctl_pre_down_nolock()

Both the comment and the name of ipcctl_pre_down_nolock() are misleading:
The function must be called while holdling the rw semaphore.

Therefore the patch renames the function to ipcctl_obtain_check(): This
name matches the other names used in util.c:

- "obtain" function look up a pointer in the idr, without
  acquiring the object lock.
- The caller is responsible for locking.
- _check means that the sequence number is checked.

Link: http://lkml.kernel.org/r/20180712185241.4017-5-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/msg.c  | 2 +-
 ipc/sem.c  | 2 +-
 ipc/shm.c  | 2 +-
 ipc/util.c | 8 ++++----
 ipc/util.h | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 3132aa27ff88..b9b47d6cd7ee 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -386,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
 	down_write(&msg_ids(ns).rwsem);
 	rcu_read_lock();
 
-	ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+	ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd,
 				      &msqid64->msg_perm, msqid64->msg_qbytes);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
diff --git a/ipc/sem.c b/ipc/sem.c
index 8e72037b894a..cf2ba4509f0d 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1596,7 +1596,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid,
 	down_write(&sem_ids(ns).rwsem);
 	rcu_read_lock();
 
-	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
+	ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
 				      &semid64->sem_perm, 0);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
diff --git a/ipc/shm.c b/ipc/shm.c
index 6e7bd9830549..a413ddf74dac 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -881,7 +881,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
 	down_write(&shm_ids(ns).rwsem);
 	rcu_read_lock();
 
-	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
+	ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd,
 				      &shmid64->shm_perm, 0);
 	if (IS_ERR(ipcp)) {
 		err = PTR_ERR(ipcp);
diff --git a/ipc/util.c b/ipc/util.c
index 465bbd21d234..1d88dffd75e7 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -688,7 +688,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
 }
 
 /**
- * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
+ * ipcctl_obtain_check - retrieve an ipc object and check permissions
  * @ns:  ipc namespace
  * @ids:  the table of ids where to look for the ipc
  * @id:   the id of the ipc to retrieve
@@ -698,16 +698,16 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
  *
  * This function does some common audit and permissions check for some IPC_XXX
  * cmd and is called from semctl_down, shmctl_down and msgctl_down.
- * It must be called without any lock held and:
  *
- *   - retrieves the ipc with the given id in the given table.
+ * It:
+ *   - retrieves the ipc object with the given id in the given table.
  *   - performs some audit and permission check, depending on the given cmd
  *   - returns a pointer to the ipc object or otherwise, the corresponding
  *     error.
  *
  * Call holding the both the rwsem and the rcu read lock.
  */
-struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
 					struct ipc_ids *ids, int id, int cmd,
 					struct ipc64_perm *perm, int extra_perm)
 {
diff --git a/ipc/util.h b/ipc/util.h
index 0aba3230d007..fcf81425ae98 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -148,7 +148,7 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id);
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
 void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
 int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
-struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns,
 					     struct ipc_ids *ids, int id, int cmd,
 					     struct ipc64_perm *perm, int extra_perm);
 
-- 
cgit v1.2.3


From 2e5ceb452c9b7f0a30c87bd61c01260e3d8464ad Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:01:37 -0700
Subject: ipc/util.c: correct comment in ipc_obtain_object_check

The comment that explains ipc_obtain_object_check is wrong: The function
checks the sequence number, not the reference counter.

Note that checking the reference counter would be meaningless: The
reference counter is decreased without holding any locks, thus an object
with kern_ipc_perm.deleted=true may disappear at the end of the next rcu
grace period.

Link: http://lkml.kernel.org/r/20180712185241.4017-6-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/util.c b/ipc/util.c
index 1d88dffd75e7..dcb437095cbd 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -629,8 +629,8 @@ err:
  * @ids: ipc identifier set
  * @id: ipc id to look for
  *
- * Similar to ipc_obtain_object_idr() but also checks
- * the ipc object reference counter.
+ * Similar to ipc_obtain_object_idr() but also checks the ipc object
+ * sequence number.
  *
  * Call inside the RCU critical section.
  * The ipc object is *not* locked on exit.
-- 
cgit v1.2.3


From 82061c57ce93caa50fd01c11f8eb30ddc4c4bd75 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 22:01:41 -0700
Subject: ipc: drop ipc_lock()

ipc/util.c contains multiple functions to get the ipc object pointer given
an id number.

There are two sets of function: One set verifies the sequence counter part
of the id number, other functions do not check the sequence counter.

The standard for function names in ipc/util.c is
- ..._check() functions verify the sequence counter
- ..._idr() functions do not verify the sequence counter

ipc_lock() is an exception: It does not verify the sequence counter value,
but this is not obvious from the function name.

Furthermore, shm.c is the only user of this helper.  Thus, we can simply
move the logic into shm_lock() and get rid of the function altogether.

[manfred@colorfullife.com: most of changelog]
Link: http://lkml.kernel.org/r/20180712185241.4017-7-manfred@colorfullife.com
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/shm.c  | 29 +++++++++++++++++++++++------
 ipc/util.c | 36 ------------------------------------
 ipc/util.h |  1 -
 3 files changed, 23 insertions(+), 43 deletions(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index a413ddf74dac..4c7ae625d996 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -180,16 +180,33 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace
  */
 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
 {
-	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+	struct kern_ipc_perm *ipcp;
+
+	rcu_read_lock();
+	ipcp = ipc_obtain_object_idr(&shm_ids(ns), id);
+	if (IS_ERR(ipcp))
+		goto err;
 
+	ipc_lock_object(ipcp);
+	/*
+	 * ipc_rmid() may have already freed the ID while ipc_lock_object()
+	 * was spinning: here verify that the structure is still valid.
+	 * Upon races with RMID, return -EIDRM, thus indicating that
+	 * the ID points to a removed identifier.
+	 */
+	if (ipc_valid_object(ipcp)) {
+		/* return a locked ipc object upon success */
+		return container_of(ipcp, struct shmid_kernel, shm_perm);
+	}
+
+	ipc_unlock_object(ipcp);
+err:
+	rcu_read_unlock();
 	/*
 	 * Callers of shm_lock() must validate the status of the returned ipc
-	 * object pointer (as returned by ipc_lock()), and error out as
-	 * appropriate.
+	 * object pointer and error out as appropriate.
 	 */
-	if (IS_ERR(ipcp))
-		return (void *)ipcp;
-	return container_of(ipcp, struct shmid_kernel, shm_perm);
+	return (void *)ipcp;
 }
 
 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
diff --git a/ipc/util.c b/ipc/util.c
index dcb437095cbd..bd1863b6ed39 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -588,42 +588,6 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
 	return out;
 }
 
-/**
- * ipc_lock - lock an ipc structure without rwsem held
- * @ids: ipc identifier set
- * @id: ipc id to look for
- *
- * Look for an id in the ipc ids idr and lock the associated ipc object.
- *
- * The ipc object is locked on successful exit.
- */
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
-{
-	struct kern_ipc_perm *out;
-
-	rcu_read_lock();
-	out = ipc_obtain_object_idr(ids, id);
-	if (IS_ERR(out))
-		goto err;
-
-	spin_lock(&out->lock);
-
-	/*
-	 * ipc_rmid() may have already freed the ID while ipc_lock()
-	 * was spinning: here verify that the structure is still valid.
-	 * Upon races with RMID, return -EIDRM, thus indicating that
-	 * the ID points to a removed identifier.
-	 */
-	if (ipc_valid_object(out))
-		return out;
-
-	spin_unlock(&out->lock);
-	out = ERR_PTR(-EIDRM);
-err:
-	rcu_read_unlock();
-	return out;
-}
-
 /**
  * ipc_obtain_object_check
  * @ids: ipc identifier set
diff --git a/ipc/util.h b/ipc/util.h
index fcf81425ae98..e3c47b21db93 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -142,7 +142,6 @@ int ipc_rcu_getref(struct kern_ipc_perm *ptr);
 void ipc_rcu_putref(struct kern_ipc_perm *ptr,
 			void (*func)(struct rcu_head *head));
 
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
-- 
cgit v1.2.3


From 93f976b5190df32793908d49165f78e67fcb66cf Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 22:01:45 -0700
Subject: lib/rhashtable: simplify bucket_table_alloc()

As of ce91f6ee5b3b ("mm: kvmalloc does not fallback to vmalloc for
incompatible gfp flags") we can simplify the caller and trust kvzalloc()
to just do the right thing.  For the case of the GFP_ATOMIC context, we
can drop the __GFP_NORETRY flag for obvious reasons, and for the
__GFP_NOWARN case, however, it is changed such that the caller passes the
flag instead of making bucket_table_alloc() handle it.

This slightly changes the gfp flags passed on to nested_table_alloc() as
it will now also use GFP_ATOMIC | __GFP_NOWARN.  However, I consider this
a positive consequence as for the same reasons we want nowarn semantics in
bucket_table_alloc().

[manfred@colorfullife.com: commit id extended to 12 digits, line wraps updated]
Link: http://lkml.kernel.org/r/20180712185241.4017-8-manfred@colorfullife.com
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/rhashtable.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index ae4223e0f5bc..238c8daa9852 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -174,10 +174,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 	int i;
 
 	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
-	if (gfp != GFP_KERNEL)
-		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-	else
-		tbl = kvzalloc(size, gfp);
+	tbl = kvzalloc(size, gfp);
 
 	size = nbuckets;
 
@@ -450,7 +447,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 
 	err = -ENOMEM;
 
-	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
 	if (new_tbl == NULL)
 		goto fail;
 
-- 
cgit v1.2.3


From 2d22ecf6db1c390974476758681ba4229018e774 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 22:01:48 -0700
Subject: lib/rhashtable: guarantee initial hashtable allocation

rhashtable_init() may fail due to -ENOMEM, thus making the entire api
unusable.  This patch removes this scenario, however unlikely.  In order
to guarantee memory allocation, this patch always ends up doing
GFP_KERNEL|__GFP_NOFAIL for both the tbl as well as
alloc_bucket_spinlocks().

Upon the first table allocation failure, we shrink the size to the
smallest value that makes sense and retry with __GFP_NOFAIL semantics.
With the defaults, this means that from 64 buckets, we retry with only 4.
Any later issues regarding performance due to collisions or larger table
resizing (when more memory becomes available) is the least of our
problems.

Link: http://lkml.kernel.org/r/20180712185241.4017-9-manfred@colorfullife.com
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/rhashtable.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 238c8daa9852..310e29b51507 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -178,10 +178,11 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 
 	size = nbuckets;
 
-	if (tbl == NULL && gfp != GFP_KERNEL) {
+	if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
 		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
 		nbuckets = 0;
 	}
+
 	if (tbl == NULL)
 		return NULL;
 
@@ -1057,9 +1058,16 @@ int rhashtable_init(struct rhashtable *ht,
 		}
 	}
 
+	/*
+	 * This is api initialization and thus we need to guarantee the
+	 * initial rhashtable allocation. Upon failure, retry with the
+	 * smallest possible size with __GFP_NOFAIL semantics.
+	 */
 	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-	if (tbl == NULL)
-		return -ENOMEM;
+	if (unlikely(tbl == NULL)) {
+		size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+		tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+	}
 
 	atomic_set(&ht->nelems, 0);
 
-- 
cgit v1.2.3


From dc2c8c84def6ce450c63529e08c1db100020994e Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 22:01:52 -0700
Subject: ipc: get rid of ids->tables_initialized hack

In sysvipc we have an ids->tables_initialized regarding the rhashtable,
introduced in 0cfb6aee70bd ("ipc: optimize semget/shmget/msgget for lots
of keys")

It's there, specifically, to prevent nil pointer dereferences, from using
an uninitialized api.  Considering how rhashtable_init() can fail
(probably due to ENOMEM, if anything), this made the overall ipc
initialization capable of failure as well.  That alone is ugly, but fine,
however I've spotted a few issues regarding the semantics of
tables_initialized (however unlikely they may be):

- There is inconsistency in what we return to userspace: ipc_addid()
  returns ENOSPC which is certainly _wrong_, while ipc_obtain_object_idr()
  returns EINVAL.

- After we started using rhashtables, ipc_findkey() can return nil upon
  !tables_initialized, but the caller expects nil for when the ipc
  structure isn't found, and can therefore call into ipcget() callbacks.

Now that rhashtable initialization cannot fail, we can properly get rid of
the hack altogether.

[manfred@colorfullife.com: commit id extended to 12 digits]
Link: http://lkml.kernel.org/r/20180712185241.4017-10-manfred@colorfullife.com
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h |  1 -
 ipc/util.c                    | 23 ++++++++---------------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6cea726612b7..e9ccdfdb1928 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -16,7 +16,6 @@ struct user_namespace;
 struct ipc_ids {
 	int in_use;
 	unsigned short seq;
-	bool tables_initialized;
 	struct rw_semaphore rwsem;
 	struct idr ipcs_idr;
 	int max_id;
diff --git a/ipc/util.c b/ipc/util.c
index bd1863b6ed39..7c1387c9510d 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -126,7 +126,6 @@ int ipc_init_ids(struct ipc_ids *ids)
 	if (err)
 		return err;
 	idr_init(&ids->ipcs_idr);
-	ids->tables_initialized = true;
 	ids->max_id = -1;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	ids->next_id = -1;
@@ -179,19 +178,16 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
  */
 static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
 {
-	struct kern_ipc_perm *ipcp = NULL;
+	struct kern_ipc_perm *ipcp;
 
-	if (likely(ids->tables_initialized))
-		ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
+	ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
 					      ipc_kht_params);
+	if (!ipcp)
+		return NULL;
 
-	if (ipcp) {
-		rcu_read_lock();
-		ipc_lock_object(ipcp);
-		return ipcp;
-	}
-
-	return NULL;
+	rcu_read_lock();
+	ipc_lock_object(ipcp);
+	return ipcp;
 }
 
 /*
@@ -269,7 +265,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	if (limit > IPCMNI)
 		limit = IPCMNI;
 
-	if (!ids->tables_initialized || ids->in_use >= limit)
+	if (ids->in_use >= limit)
 		return -ENOSPC;
 
 	idr_preload(GFP_KERNEL);
@@ -578,9 +574,6 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
 	struct kern_ipc_perm *out;
 	int lid = ipcid_to_idx(id);
 
-	if (unlikely(!ids->tables_initialized))
-		return ERR_PTR(-EINVAL);
-
 	out = idr_find(&ids->ipcs_idr, lid);
 	if (!out)
 		return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From eae04d25a713304c978d7c45dcab01b0e0811c74 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 21 Aug 2018 22:01:56 -0700
Subject: ipc: simplify ipc initialization

Now that we know that rhashtable_init() will not fail, we can get rid of a
lot of the unnecessary cleanup paths when the call errored out.

[manfred@colorfullife.com: variable name added to util.h to resolve checkpatch warning]
Link: http://lkml.kernel.org/r/20180712185241.4017-11-manfred@colorfullife.com
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/msg.c       |  9 ++++-----
 ipc/namespace.c | 20 ++++----------------
 ipc/sem.c       | 10 ++++------
 ipc/shm.c       |  9 ++++-----
 ipc/util.c      | 18 +++++-------------
 ipc/util.h      | 18 +++++++++---------
 6 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index b9b47d6cd7ee..ee78dad90460 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1238,7 +1238,7 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
 }
 #endif
 
-int msg_init_ns(struct ipc_namespace *ns)
+void msg_init_ns(struct ipc_namespace *ns)
 {
 	ns->msg_ctlmax = MSGMAX;
 	ns->msg_ctlmnb = MSGMNB;
@@ -1246,7 +1246,7 @@ int msg_init_ns(struct ipc_namespace *ns)
 
 	atomic_set(&ns->msg_bytes, 0);
 	atomic_set(&ns->msg_hdrs, 0);
-	return ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
 }
 
 #ifdef CONFIG_IPC_NS
@@ -1287,12 +1287,11 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 }
 #endif
 
-int __init msg_init(void)
+void __init msg_init(void)
 {
-	const int err = msg_init_ns(&init_ipc_ns);
+	msg_init_ns(&init_ipc_ns);
 
 	ipc_init_proc_interface("sysvipc/msg",
 				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
 				IPC_MSG_IDS, sysvipc_msg_proc_show);
-	return err;
 }
diff --git a/ipc/namespace.c b/ipc/namespace.c
index f59a89966f92..21607791d62c 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -55,28 +55,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 
-	err = sem_init_ns(ns);
+	err = mq_init_ns(ns);
 	if (err)
 		goto fail_put;
-	err = msg_init_ns(ns);
-	if (err)
-		goto fail_destroy_sem;
-	err = shm_init_ns(ns);
-	if (err)
-		goto fail_destroy_msg;
 
-	err = mq_init_ns(ns);
-	if (err)
-		goto fail_destroy_shm;
+	sem_init_ns(ns);
+	msg_init_ns(ns);
+	shm_init_ns(ns);
 
 	return ns;
 
-fail_destroy_shm:
-	shm_exit_ns(ns);
-fail_destroy_msg:
-	msg_exit_ns(ns);
-fail_destroy_sem:
-	sem_exit_ns(ns);
 fail_put:
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
diff --git a/ipc/sem.c b/ipc/sem.c
index cf2ba4509f0d..e4df102f3404 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -221,14 +221,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #define sc_semopm	sem_ctls[2]
 #define sc_semmni	sem_ctls[3]
 
-int sem_init_ns(struct ipc_namespace *ns)
+void sem_init_ns(struct ipc_namespace *ns)
 {
 	ns->sc_semmsl = SEMMSL;
 	ns->sc_semmns = SEMMNS;
 	ns->sc_semopm = SEMOPM;
 	ns->sc_semmni = SEMMNI;
 	ns->used_sems = 0;
-	return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
 }
 
 #ifdef CONFIG_IPC_NS
@@ -240,14 +240,12 @@ void sem_exit_ns(struct ipc_namespace *ns)
 }
 #endif
 
-int __init sem_init(void)
+void __init sem_init(void)
 {
-	const int err = sem_init_ns(&init_ipc_ns);
-
+	sem_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/sem",
 				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
 				IPC_SEM_IDS, sysvipc_sem_proc_show);
-	return err;
 }
 
 /**
diff --git a/ipc/shm.c b/ipc/shm.c
index 4c7ae625d996..c4fb359f1e53 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -96,14 +96,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
 #endif
 
-int shm_init_ns(struct ipc_namespace *ns)
+void shm_init_ns(struct ipc_namespace *ns)
 {
 	ns->shm_ctlmax = SHMMAX;
 	ns->shm_ctlall = SHMALL;
 	ns->shm_ctlmni = SHMMNI;
 	ns->shm_rmid_forced = 0;
 	ns->shm_tot = 0;
-	return ipc_init_ids(&shm_ids(ns));
+	ipc_init_ids(&shm_ids(ns));
 }
 
 /*
@@ -136,9 +136,8 @@ void shm_exit_ns(struct ipc_namespace *ns)
 
 static int __init ipc_ns_init(void)
 {
-	const int err = shm_init_ns(&init_ipc_ns);
-	WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err);
-	return err;
+	shm_init_ns(&init_ipc_ns);
+	return 0;
 }
 
 pure_initcall(ipc_ns_init);
diff --git a/ipc/util.c b/ipc/util.c
index 7c1387c9510d..a2aae8c1410d 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -88,16 +88,12 @@ struct ipc_proc_iface {
  */
 static int __init ipc_init(void)
 {
-	int err_sem, err_msg;
-
 	proc_mkdir("sysvipc", NULL);
-	err_sem = sem_init();
-	WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem);
-	err_msg = msg_init();
-	WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg);
+	sem_init();
+	msg_init();
 	shm_init();
 
-	return err_msg ? err_msg : err_sem;
+	return 0;
 }
 device_initcall(ipc_init);
 
@@ -116,21 +112,17 @@ static const struct rhashtable_params ipc_kht_params = {
  * Set up the sequence range to use for the ipc identifier range (limited
  * below IPCMNI) then initialise the keys hashtable and ids idr.
  */
-int ipc_init_ids(struct ipc_ids *ids)
+void ipc_init_ids(struct ipc_ids *ids)
 {
-	int err;
 	ids->in_use = 0;
 	ids->seq = 0;
 	init_rwsem(&ids->rwsem);
-	err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
-	if (err)
-		return err;
+	rhashtable_init(&ids->key_ht, &ipc_kht_params);
 	idr_init(&ids->ipcs_idr);
 	ids->max_id = -1;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	ids->next_id = -1;
 #endif
-	return 0;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/ipc/util.h b/ipc/util.h
index e3c47b21db93..6c5c77c61f85 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -18,8 +18,8 @@
 #define IPCMNI 32768  /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
 #define SEQ_MULTIPLIER	(IPCMNI)
 
-int sem_init(void);
-int msg_init(void);
+void sem_init(void);
+void msg_init(void);
 void shm_init(void);
 
 struct ipc_namespace;
@@ -34,17 +34,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }
 #endif
 
 #ifdef CONFIG_SYSVIPC
-int sem_init_ns(struct ipc_namespace *ns);
-int msg_init_ns(struct ipc_namespace *ns);
-int shm_init_ns(struct ipc_namespace *ns);
+void sem_init_ns(struct ipc_namespace *ns);
+void msg_init_ns(struct ipc_namespace *ns);
+void shm_init_ns(struct ipc_namespace *ns);
 
 void sem_exit_ns(struct ipc_namespace *ns);
 void msg_exit_ns(struct ipc_namespace *ns);
 void shm_exit_ns(struct ipc_namespace *ns);
 #else
-static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; }
-static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
-static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline void sem_init_ns(struct ipc_namespace *ns) { }
+static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline void shm_init_ns(struct ipc_namespace *ns) { }
 
 static inline void sem_exit_ns(struct ipc_namespace *ns) { }
 static inline void msg_exit_ns(struct ipc_namespace *ns) { }
@@ -83,7 +83,7 @@ struct ipc_ops {
 struct seq_file;
 struct ipc_ids;
 
-int ipc_init_ids(struct ipc_ids *);
+void ipc_init_ids(struct ipc_ids *ids);
 #ifdef CONFIG_PROC_FS
 void __init ipc_init_proc_interface(const char *path, const char *header,
 		int ids, int (*show)(struct seq_file *, void *));
-- 
cgit v1.2.3


From 27c331a174614208d0b539019583990967ad9479 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:02:00 -0700
Subject: ipc/util.c: further variable name cleanups

The varable names got a mess, thus standardize them again:

id: user space id. Called semid, shmid, msgid if the type is known.
    Most functions use "id" already.
idx: "index" for the idr lookup
    Right now, some functions use lid, ipc_addid() already uses idx as
    the variable name.
seq: sequence number, to avoid quick collisions of the user space id
key: user space key, used for the rhash tree

Link: http://lkml.kernel.org/r/20180712185241.4017-12-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h |  2 +-
 ipc/msg.c                     |  6 +++---
 ipc/sem.c                     |  6 +++---
 ipc/shm.c                     |  4 ++--
 ipc/util.c                    | 26 +++++++++++++-------------
 ipc/util.h                    | 10 +++++-----
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e9ccdfdb1928..6ab8c1bada3f 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -18,7 +18,7 @@ struct ipc_ids {
 	unsigned short seq;
 	struct rw_semaphore rwsem;
 	struct idr ipcs_idr;
-	int max_id;
+	int max_idx;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	int next_id;
 #endif
diff --git a/ipc/msg.c b/ipc/msg.c
index ee78dad90460..883642cf2b27 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -456,7 +456,7 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
 			 int cmd, struct msginfo *msginfo)
 {
 	int err;
-	int max_id;
+	int max_idx;
 
 	/*
 	 * We must not return kernel stack data.
@@ -483,9 +483,9 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
 		msginfo->msgpool = MSGPOOL;
 		msginfo->msgtql = MSGTQL;
 	}
-	max_id = ipc_get_maxid(&msg_ids(ns));
+	max_idx = ipc_get_maxidx(&msg_ids(ns));
 	up_read(&msg_ids(ns).rwsem);
-	return (max_id < 0) ? 0 : max_id;
+	return (max_idx < 0) ? 0 : max_idx;
 }
 
 static int msgctl_stat(struct ipc_namespace *ns, int msqid,
diff --git a/ipc/sem.c b/ipc/sem.c
index e4df102f3404..26f8e37fcdcb 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1294,7 +1294,7 @@ static int semctl_info(struct ipc_namespace *ns, int semid,
 			 int cmd, void __user *p)
 {
 	struct seminfo seminfo;
-	int max_id;
+	int max_idx;
 	int err;
 
 	err = security_sem_semctl(NULL, cmd);
@@ -1318,11 +1318,11 @@ static int semctl_info(struct ipc_namespace *ns, int semid,
 		seminfo.semusz = SEMUSZ;
 		seminfo.semaem = SEMAEM;
 	}
-	max_id = ipc_get_maxid(&sem_ids(ns));
+	max_idx = ipc_get_maxidx(&sem_ids(ns));
 	up_read(&sem_ids(ns).rwsem);
 	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
 		return -EFAULT;
-	return (max_id < 0) ? 0 : max_id;
+	return (max_idx < 0) ? 0 : max_idx;
 }
 
 static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
diff --git a/ipc/shm.c b/ipc/shm.c
index c4fb359f1e53..b0eb3757ab89 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -948,7 +948,7 @@ static int shmctl_ipc_info(struct ipc_namespace *ns,
 		shminfo->shmall = ns->shm_ctlall;
 		shminfo->shmmin = SHMMIN;
 		down_read(&shm_ids(ns).rwsem);
-		err = ipc_get_maxid(&shm_ids(ns));
+		err = ipc_get_maxidx(&shm_ids(ns));
 		up_read(&shm_ids(ns).rwsem);
 		if (err < 0)
 			err = 0;
@@ -968,7 +968,7 @@ static int shmctl_shm_info(struct ipc_namespace *ns,
 		shm_info->shm_tot = ns->shm_tot;
 		shm_info->swap_attempts = 0;
 		shm_info->swap_successes = 0;
-		err = ipc_get_maxid(&shm_ids(ns));
+		err = ipc_get_maxidx(&shm_ids(ns));
 		up_read(&shm_ids(ns).rwsem);
 		if (err < 0)
 			err = 0;
diff --git a/ipc/util.c b/ipc/util.c
index a2aae8c1410d..6f30ba80ca15 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -119,7 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids)
 	init_rwsem(&ids->rwsem);
 	rhashtable_init(&ids->key_ht, &ipc_kht_params);
 	idr_init(&ids->ipcs_idr);
-	ids->max_id = -1;
+	ids->max_idx = -1;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	ids->next_id = -1;
 #endif
@@ -237,7 +237,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
  * @limit: limit for the number of used ids
  *
  * Add an entry 'new' to the ipc ids idr. The permissions object is
- * initialised and the first free entry is set up and the id assigned
+ * initialised and the first free entry is set up and the index assigned
  * is returned. The 'new' entry is returned in a locked state on success.
  *
  * On failure the entry is not locked and a negative err-code is returned.
@@ -291,8 +291,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	}
 
 	ids->in_use++;
-	if (idx > ids->max_id)
-		ids->max_id = idx;
+	if (idx > ids->max_idx)
+		ids->max_idx = idx;
 	return idx;
 }
 
@@ -431,20 +431,20 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
  */
 void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
 {
-	int lid = ipcid_to_idx(ipcp->id);
+	int idx = ipcid_to_idx(ipcp->id);
 
-	idr_remove(&ids->ipcs_idr, lid);
+	idr_remove(&ids->ipcs_idr, idx);
 	ipc_kht_remove(ids, ipcp);
 	ids->in_use--;
 	ipcp->deleted = true;
 
-	if (unlikely(lid == ids->max_id)) {
+	if (unlikely(idx == ids->max_idx)) {
 		do {
-			lid--;
-			if (lid == -1)
+			idx--;
+			if (idx == -1)
 				break;
-		} while (!idr_find(&ids->ipcs_idr, lid));
-		ids->max_id = lid;
+		} while (!idr_find(&ids->ipcs_idr, idx));
+		ids->max_idx = idx;
 	}
 }
 
@@ -564,9 +564,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
 struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
 {
 	struct kern_ipc_perm *out;
-	int lid = ipcid_to_idx(id);
+	int idx = ipcid_to_idx(id);
 
-	out = idr_find(&ids->ipcs_idr, lid);
+	out = idr_find(&ids->ipcs_idr, idx);
 	if (!out)
 		return ERR_PTR(-EINVAL);
 
diff --git a/ipc/util.h b/ipc/util.h
index 6c5c77c61f85..e74564fe3375 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -113,12 +113,12 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);
 int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
 
 /**
- * ipc_get_maxid - get the last assigned id
+ * ipc_get_maxidx - get the highest assigned index
  * @ids: ipc identifier set
  *
  * Called with ipc_ids.rwsem held for reading.
  */
-static inline int ipc_get_maxid(struct ipc_ids *ids)
+static inline int ipc_get_maxidx(struct ipc_ids *ids)
 {
 	if (ids->in_use == 0)
 		return -1;
@@ -126,7 +126,7 @@ static inline int ipc_get_maxid(struct ipc_ids *ids)
 	if (ids->in_use == IPCMNI)
 		return IPCMNI - 1;
 
-	return ids->max_id;
+	return ids->max_idx;
 }
 
 /*
@@ -172,9 +172,9 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len);
 extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
 extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
 
-static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
+static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id)
 {
-	return uid / SEQ_MULTIPLIER != ipcp->seq;
+	return ipcid_to_seqx(id) != ipcp->seq;
 }
 
 static inline void ipc_lock_object(struct kern_ipc_perm *perm)
-- 
cgit v1.2.3


From 2a9d6481004215da8e93edb588cf448f2af80303 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 21 Aug 2018 22:02:04 -0700
Subject: ipc/util.c: update return value of ipc_getref from int to bool

ipc_getref has still a return value of type "int", matching the atomic_t
interface of atomic_inc_not_zero()/atomic_add_unless().

ipc_getref now uses refcount_inc_not_zero, which has a return value of
type "bool".

Therefore, update the return code to avoid implicit conversions.

Link: http://lkml.kernel.org/r/20180712185241.4017-13-manfred@colorfullife.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/util.c | 2 +-
 ipc/util.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/util.c b/ipc/util.c
index 6f30ba80ca15..0af05752969f 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -462,7 +462,7 @@ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
 	ipcp->key = IPC_PRIVATE;
 }
 
-int ipc_rcu_getref(struct kern_ipc_perm *ptr)
+bool ipc_rcu_getref(struct kern_ipc_perm *ptr)
 {
 	return refcount_inc_not_zero(&ptr->refcount);
 }
diff --git a/ipc/util.h b/ipc/util.h
index e74564fe3375..0a159f69b3bb 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -138,7 +138,7 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
  * refcount is initialized by ipc_addid(), before that point call_rcu()
  * must be used.
  */
-int ipc_rcu_getref(struct kern_ipc_perm *ptr);
+bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
 void ipc_rcu_putref(struct kern_ipc_perm *ptr,
 			void (*func)(struct rcu_head *head));
 
-- 
cgit v1.2.3