From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 20 May 2016 16:56:31 -0700 Subject: mm: vmscan: reduce size of inactive file list The inactive file list should still be large enough to contain readahead windows and freshly written file data, but it no longer is the only source for detecting multiple accesses to file pages. The workingset refault measurement code causes recently evicted file pages that get accessed again after a shorter interval to be promoted directly to the active list. With that mechanism in place, we can afford to (on a larger system) dedicate more memory to the active file list, so we can actually cache more of the frequently used file pages in memory, and not have them pushed out by streaming writes, once-used streaming file reads, etc. This can help things like database workloads, where only half the page cache can currently be used to cache the database working set. This patch automatically increases that fraction on larger systems, using the same ratio that has already been used for anonymous memory. [hannes@cmpxchg.org: cgroup-awareness] Signed-off-by: Rik van Riel Signed-off-by: Johannes Weiner Reported-by: Andres Freund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 94da96738df3..a805474df4ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - void mem_cgroup_handle_over_high(void); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, @@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - return true; -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { -- cgit v1.2.3 From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:38 -0700 Subject: mm, compaction: change COMPACT_ constants into enum Compaction code is doing weird dances between COMPACT_FOO -> int -> unsigned long But there doesn't seem to be any reason for that. All functions which return/use one of those constants are not expecting any other value so it really makes sense to define an enum for them and make it clear that no other values are expected. This is a pure cleanup and shouldn't introduce any functional changes. 
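To see why the enum matters in practice: once the return type is enum compact_result, implicit conversions to bare integers become visible and a switch over the result gets -Wswitch coverage warnings from the compiler. Below is a minimal userspace sketch of that idea, not kernel code; the enumerators mirror the first five states from the patch, while handle_feedback() and its strings are purely illustrative.

#include <stdio.h>

/* Mirrors the first five states introduced by this patch; comments abbreviated. */
enum compact_result {
	COMPACT_DEFERRED,	/* deferred due to past failures */
	COMPACT_SKIPPED,	/* not possible or reclaim was more suitable */
	COMPACT_CONTINUE,	/* continue to another pageblock */
	COMPACT_PARTIAL,	/* partially compacted, suitable pages exist */
	COMPACT_COMPLETE,	/* the full zone was compacted */
};

/* With an enum return type, -Wswitch warns if a state is left unhandled here. */
static const char *handle_feedback(enum compact_result result)
{
	switch (result) {
	case COMPACT_DEFERRED:	return "back off, compaction recently failed";
	case COMPACT_SKIPPED:	return "let direct reclaim run first";
	case COMPACT_CONTINUE:	return "keep compacting";
	case COMPACT_PARTIAL:	return "retry the allocation";
	case COMPACT_COMPLETE:	return "whole zone scanned";
	}
	return "unexpected value";
}

int main(void)
{
	printf("%s\n", handle_feedback(COMPACT_PARTIAL));
	return 0;
}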
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------ mm/compaction.c | 27 ++++++++++++++------------- mm/page_alloc.c | 2 +- 3 files changed, 42 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 242b660f64e6..706cbf00e919 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,21 +2,29 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ -/* compaction didn't start as it was deferred due to past failures */ -#define COMPACT_DEFERRED 0 -/* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 1 -/* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 2 -/* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 3 -/* The full zone was compacted */ -#define COMPACT_COMPLETE 4 -/* For more detailed tracepoint output */ -#define COMPACT_NO_SUITABLE_PAGE 5 -#define COMPACT_NOT_SUITABLE_ZONE 6 -#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ +enum compact_result { + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* + * compaction didn't start as it was not possible or direct reclaim + * was more suitable + */ + COMPACT_SKIPPED, + /* compaction should continue to another pageblock */ + COMPACT_CONTINUE, + /* + * direct compaction partially compacted a zone and there are suitable + * pages + */ + COMPACT_PARTIAL, + /* The full zone was compacted */ + COMPACT_COMPLETE, + /* For more detailed tracepoint output */ + COMPACT_NO_SUITABLE_PAGE, + COMPACT_NOT_SUITABLE_ZONE, + COMPACT_CONTENDED, +}; /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ @@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern unsigned long compaction_suitable(struct zone *zone, int order, +extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); @@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, +static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) @@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline unsigned long compaction_suitable(struct zone *zone, int 
order, +static inline enum compact_result compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { return COMPACT_SKIPPED; diff --git a/mm/compaction.c b/mm/compaction.c index eda3c2244f30..e721d252c5d2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, +static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { @@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, +enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); trace_mm_compaction_suitable(zone, order, ret); @@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { - int ret; + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1530,11 +1531,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) { @@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_DEFERRED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; if (compaction_deferred(zone, order)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index edbdf56b3c9b..ed62c4b90598 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - unsigned long compact_result; + enum compact_result compact_result; struct page *page; if (!order) -- cgit v1.2.3 From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:44 -0700 Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED try_to_compact_pages() can currently return COMPACT_SKIPPED even when the compaction is deferred for some zone, just because zone DMA is skipped in 99% of cases due to watermark checks. This makes COMPACT_DEFERRED basically unusable for the page allocator as a feedback mechanism. Make sure we distinguish those two states properly and switch their ordering in the enum. This means that COMPACT_SKIPPED will be returned only when all eligible zones are skipped. As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath will be more precise and we would bail out rather than reclaim. 
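The ordering switch is what makes the accumulation across zones work: because COMPACT_SKIPPED is now the smallest value, keeping a running maximum yields COMPACT_SKIPPED only if every eligible zone was skipped, and COMPACT_DEFERRED only if no zone reported anything more useful. A minimal userspace sketch of that accumulation follows; combine() and the sample values are illustrative stand-ins, not kernel code.

#include <stdio.h>

/* The ordering after this patch: SKIPPED sorts below DEFERRED. */
enum compact_result {
	COMPACT_SKIPPED,
	COMPACT_DEFERRED,
	COMPACT_INACTIVE = COMPACT_DEFERRED,
	COMPACT_CONTINUE,
	COMPACT_PARTIAL,
	COMPACT_COMPLETE,
};

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

/*
 * Accumulate per-zone feedback the way try_to_compact_pages() does after
 * this patch: the combined result is SKIPPED only when every zone was
 * skipped, and DEFERRED only when no zone reported anything more useful.
 */
static enum compact_result combine(const enum compact_result *status, int nr)
{
	enum compact_result rc = COMPACT_SKIPPED;
	int i;

	for (i = 0; i < nr; i++)
		rc = max_t(enum compact_result, status[i], rc);
	return rc;
}

int main(void)
{
	enum compact_result zones[] = {
		COMPACT_SKIPPED, COMPACT_DEFERRED, COMPACT_SKIPPED
	};

	/* prints 1, i.e. COMPACT_DEFERRED */
	printf("combined result = %d\n", combine(zones, 3));
	return 0;
}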
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 7 +++++-- include/trace/events/compaction.h | 2 +- mm/compaction.c | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 706cbf00e919..11f228712ed5 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,13 +4,16 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { - /* compaction didn't start as it was deferred due to past failures */ - COMPACT_DEFERRED, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* compaction not active last round */ + COMPACT_INACTIVE = COMPACT_DEFERRED, + /* compaction should continue to another pageblock */ COMPACT_CONTINUE, /* diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e215bf68f521..6ba16c86d7db 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -10,8 +10,8 @@ #include #define COMPACTION_STATUS \ - EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ diff --git a/mm/compaction.c b/mm/compaction.c index 455ecd87f48d..b2b94474dd28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - enum compact_result rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, @@ -1667,7 +1669,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; -- cgit v1.2.3 From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:47 -0700 Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE COMPACT_COMPLETE now means that compaction and free scanner met. This is not very useful information if somebody just wants to use this feedback and make any decisions based on that. The current caller might be a poor guy who just happened to scan tiny portion of the zone and that could be the reason no suitable pages were compacted. Make sure we distinguish the full and partial zone walks. 
Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success and be optimistic in retrying. The existing users of COMPACT_COMPLETE are conservatively changed to use COMPACT_PARTIAL_SKIPPED as well but some of them should be probably reconsidered and only defer the compaction only for COMPACT_COMPLETE with the new semantic. This patch shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 +++++++++- include/trace/events/compaction.h | 1 + mm/compaction.c | 14 +++++++++++--- mm/internal.h | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 11f228712ed5..9b37f9d3f7a8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,7 +21,15 @@ enum compact_result { * pages */ COMPACT_PARTIAL, - /* The full zone was compacted */ + /* + * direct compaction has scanned part of the zone but wasn't successfull + * to compact suitable pages. + */ + COMPACT_PARTIAL_SKIPPED, + /* + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. + */ COMPACT_COMPLETE, /* For more detailed tracepoint output */ COMPACT_NO_SUITABLE_PAGE, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6ba16c86d7db..36e2d6fb1360 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -14,6 +14,7 @@ EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ diff --git a/mm/compaction.c b/mm/compaction.c index b2b94474dd28..4af1577adb5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx, 0)) { success = true; compaction_defer_reset(zone, cc.order, false); - } else if (status == COMPACT_COMPLETE) { + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* * We use sync migration mode here, so we defer like * sync direct compaction does. 
diff --git a/mm/internal.h b/mm/internal.h index 3ac544f1963f..f6f3353b0868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -174,6 +174,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool whole_zone; /* Whole zone has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ -- cgit v1.2.3 From 4f9a358c36fcdad3ea1db263ec4d484a70ad543e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:50 -0700 Subject: mm, compaction: update compaction_result ordering compaction_result will be used as the primary feedback channel for compaction users. At the same time try_to_compact_pages (and potentially others) assume a certain ordering where more specific feedback takes precedence. This gets a bit awkward when we have conflicting feedback from different zones, e.g. one returning COMPACT_COMPLETE, meaning the full zone has been scanned without any outcome, while another returns COMPACT_PARTIAL, i.e. it made some progress. The caller should get COMPACT_PARTIAL because that means compaction can still make some progress. The same applies for COMPACT_PARTIAL vs COMPACT_PARTIAL_SKIPPED. Reorder PARTIAL to be the largest value, so that the larger the value, the more progress has been made. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9b37f9d3f7a8..ff39fa0a1ede 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,6 +4,8 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NOT_SUITABLE_ZONE, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, + /* compaction not active last round */ COMPACT_INACTIVE = COMPACT_DEFERRED, + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ COMPACT_CONTINUE, + /* - * direct compaction partially compacted a zone and there are suitable - * pages + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. */ - COMPACT_PARTIAL, + COMPACT_COMPLETE, /* * direct compaction has scanned part of the zone but wasn't successfull * to compact suitable pages. */ COMPACT_PARTIAL_SKIPPED, + + /* compaction terminated prematurely due to lock contentions */ + COMPACT_CONTENDED, + /* - * The full zone was compacted scanned but wasn't successfull to compact - * suitable pages. 
+ * direct compaction partially compacted a zone and there might be + * suitable pages */ - COMPACT_COMPLETE, - /* For more detailed tracepoint output */ - COMPACT_NO_SUITABLE_PAGE, - COMPACT_NOT_SUITABLE_ZONE, - COMPACT_CONTENDED, + COMPACT_PARTIAL, }; /* Used to signal whether compaction detected need_sched() or lock contention */ -- cgit v1.2.3 From cab1802b5f0dddea30547a7451fda8c7e4c593f0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:56 -0700 Subject: mm, compaction: abstract compaction feedback to helpers Compaction can provide a wild variation of feedback to the caller. Many of them are implementation specific and the caller of the compaction (especially the page allocator) shouldn't be bound to specifics of the current implementation. This patch abstracts the feedback into three basic types: - compaction_made_progress - compaction was active and made some progress. - compaction_failed - compaction failed and further attempts to invoke it would most probably fail and therefore it is not worth retrying - compaction_withdrawn - compaction wasn't invoked for an implementation specific reasons. In the current implementation it means that the compaction was deferred, contended or the page scanners met too early without any progress. Retrying is still worthwhile. [vbabka@suse.cz: do not change thp back off behavior] [akpm@linux-foundation.org: fix typo in comment, per Hillf] Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ff39fa0a1ede..8d8c916fe67a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -78,6 +78,70 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +/* Compaction has made some progress and retrying makes sense */ +static inline bool compaction_made_progress(enum compact_result result) +{ + /* + * Even though this might sound confusing this in fact tells us + * that the compaction successfully isolated and migrated some + * pageblocks. + */ + if (result == COMPACT_PARTIAL) + return true; + + return false; +} + +/* Compaction has failed and it doesn't make much sense to keep retrying. */ +static inline bool compaction_failed(enum compact_result result) +{ + /* All zones were scanned completely and still not result. */ + if (result == COMPACT_COMPLETE) + return true; + + return false; +} + +/* + * Compaction has backed off for some reason. It might be throttling or + * lock contention. Retrying is still worthwhile. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ + /* + * Compaction backed off due to watermark checks for order-0 + * so the regular reclaim has to try harder and reclaim something. + */ + if (result == COMPACT_SKIPPED) + return true; + + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. 
+ */ + if (result == COMPACT_DEFERRED) + return true; + + /* + * If compaction in async mode encounters contention or blocks higher + * priority task we back off early rather than cause stalls. + */ + if (result == COMPACT_CONTENDED) + return true; + + /* + * Page scanners have met but we haven't scanned full zones so this + * is a back off in fact. + */ + if (result == COMPACT_PARTIAL_SKIPPED) + return true; + + return false; +} + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); @@ -114,6 +178,21 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline bool compaction_made_progress(enum compact_result result) +{ + return false; +} + +static inline bool compaction_failed(enum compact_result result) +{ + return false; +} + +static inline bool compaction_withdrawn(enum compact_result result) +{ + return true; +} + static inline int kcompactd_run(int nid) { return 0; -- cgit v1.2.3 From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:00 -0700 Subject: mm, oom: rework oom detection __alloc_pages_slowpath has traditionally relied on the direct reclaim and did_some_progress as an indicator that it makes sense to retry allocation rather than declaring OOM. shrink_zones had to rely on zone_reclaimable if shrink_zone didn't make any progress to prevent from a premature OOM killer invocation - the LRU might be full of dirty or writeback pages and direct reclaim cannot clean those up. zone_reclaimable allows to rescan the reclaimable lists several times and restart if a page is freed. This is really subtle behavior and it might lead to a livelock when a single freed page keeps allocator looping but the current task will not be able to allocate that single page. OOM killer would be more appropriate than looping without any progress for unbounded amount of time. This patch changes OOM detection logic and pulls it out from shrink_zone which is too low to be appropriate for any high level decisions such as OOM which is per zonelist property. It is __alloc_pages_slowpath which knows how many attempts have been done and what was the progress so far therefore it is more appropriate to implement this logic. The new heuristic is implemented in should_reclaim_retry helper called from __alloc_pages_slowpath. It tries to be more deterministic and easier to follow. It builds on an assumption that retrying makes sense only if the currently reclaimable memory + free pages would allow the current allocation request to succeed (as per __zone_watermark_ok) at least for one zone in the usable zonelist. This alone wouldn't be sufficient, though, because the writeback might get stuck and reclaimable pages might be pinned for a really long time or even depend on the current allocation context. Therefore there is a backoff mechanism implemented which reduces the reclaim target after each reclaim round without any progress. This means that we should eventually converge to only NR_FREE_PAGES as the target and fail on the wmark check and proceed to OOM. The backoff is simple and linear with 1/16 of the reclaimable pages for each round without any progress. We are optimistic and reset counter for successful reclaim rounds. 
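To make the backoff concrete, here is a small standalone sketch of the arithmetic described above. The page counts are made-up inputs; in the kernel the resulting estimate is fed into __zone_watermark_ok() for each zone rather than printed, and retries stop either when the check fails everywhere or after MAX_RECLAIM_RETRIES rounds without progress.

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16

/* Same rounding the kernel's DIV_ROUND_UP() does. */
#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
	/* Made-up inputs: reclaimable LRU pages and free pages in one zone. */
	unsigned long reclaimable = 32768;
	unsigned long free = 128;
	int no_progress_loops;

	for (no_progress_loops = 0; no_progress_loops <= MAX_RECLAIM_RETRIES;
	     no_progress_loops++) {
		unsigned long available = reclaimable;

		/* Linear backoff: subtract 1/16 of reclaimable per failed round. */
		available -= DIV_ROUND_UP(no_progress_loops * available,
					  MAX_RECLAIM_RETRIES);
		available += free;

		printf("round %2d without progress: watermark target %lu pages\n",
		       no_progress_loops, available);
	}
	return 0;
}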
Costly high order pages mostly preserve their semantic and those without __GFP_REPEAT fail right away while those which have the flag set will back off after the amount of reclaimable pages reaches equivalent of the requested order. The only difference is that if there was no progress during the reclaim we rely on zone watermark check. This is more logical thing to do than previous 1< Acked-by: Hillf Danton Cc: Vladimir Davydov Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++----- mm/vmscan.c | 25 +++---------- 3 files changed, 97 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index ad220359f1b0..0af2bb2028fd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ +extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8bcc10616fab..fa39efc3a692 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; } +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is consider as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round), pages_reclaimed (cumulative number of reclaimed + * pages) and no_progress_loops (number of reclaim rounds without any progress + * in a row) is considered as well as the reclaimable pages on the applicable + * zone list (with a backoff mechanism which is a function of no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, unsigned long pages_reclaimed, + int no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in the row. + */ + if (no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (pages_reclaimed >= (1<zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + + available = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available? 
+ */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac->high_zoneidx, alloc_flags, available)) { + /* Wait for some write requests to complete then retry */ + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + return true; + } + } + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_result compact_result; + int no_progress_loops = 0; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3525,23 +3597,35 @@ retry: if (gfp_mask & __GFP_NORETRY) goto noretry; - /* Keep reclaiming pages as long as there is reasonable progress */ - pages_reclaimed += did_some_progress; - if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || - ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); - goto retry; + /* + * Do not retry costly high order allocations unless they are + * __GFP_REPEAT + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + goto noretry; + + if (did_some_progress) { + no_progress_loops = 0; + pages_reclaimed += did_some_progress; + } else { + no_progress_loops++; } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, pages_reclaimed, + no_progress_loops)) + goto retry; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; /* Retry as long as the OOM killer is making progress */ - if (did_some_progress) + if (did_some_progress) { + no_progress_loops = 0; goto retry; + } noretry: /* diff --git a/mm/vmscan.c b/mm/vmscan.c index a386454c015a..c4a2f4512fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -static unsigned long zone_reclaimable_pages(struct zone *zone) +unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; @@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * Returns true if a zone was reclaimable. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); - bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; - if (nr_soft_reclaimed) - reclaimable = true; /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) - reclaimable = true; - - if (global_reclaim(sc) && - !reclaimable && zone_reclaimable(zone)) - reclaimable = true; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); } /* @@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return reclaimable; } /* @@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool zones_reclaimable; retry: delayacct_freepages_start(); @@ -2638,7 +2625,7 @@ retry: vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - zones_reclaimable = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2685,10 +2672,6 @@ retry: goto retry; } - /* Any of the zones still reclaimable? Don't OOM. */ - if (zones_reclaimable) - return 1; - return 0; } -- cgit v1.2.3 From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:12 -0700 Subject: mm, oom, compaction: prevent from should_compact_retry looping for ever for costly orders "mm: consider compaction feedback also for costly allocation" has removed the upper bound for the reclaim/compaction retries based on the number of reclaimed pages for costly orders. While this is desirable the patch did miss a mis interaction between reclaim, compaction and the retry logic. 
The direct reclaim tries to get zones over min watermark while compaction backs off and returns COMPACT_SKIPPED when all zones are below low watermark + 1< Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++++ include/linux/mmzone.h | 3 +++ mm/compaction.c | 42 +++++++++++++++++++++++++++++++++++++++--- mm/page_alloc.c | 23 +++++++++++++---------- 4 files changed, 59 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 8d8c916fe67a..a58c852a268f 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result) return false; } + +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags); + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60db2096fd8..8dd0333b01dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags); diff --git a/mm/compaction.c b/mm/compaction.c index 4af1577adb5c..d8a20fcf8678 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone, */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order, { enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order, return ret; } +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim. 
+ */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. + */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { enum compact_result ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dea406a62e3d..089f760ce64a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, - unsigned int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; @@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, - enum migrate_mode *migrate_mode, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, /* * make sure the compaction wasn't deferred or didn't bail out early * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. 
*/ if (compaction_withdrawn(compact_result)) - return true; + return compaction_zonelist_suitable(ac, order, alloc_flags); /* * !costly requests are much more important than __GFP_REPEAT @@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { @@ -3706,8 +3708,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && - should_compact_retry(order, compact_result, - &migration_mode, compaction_retries)) + should_compact_retry(ac, order, alloc_flags, + compact_result, &migration_mode, + compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.2.3 From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:18 -0700 Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space") not only clears TIF_MEMDIE for oom reaped task but also set OOM_SCORE_ADJ_MIN for the target task to hide it from the oom killer. This works in simple cases but it is not sufficient for (unlikely) cases where the mm is shared between independent processes (as they do not share signal struct). If the mm had only small amount of memory which could be reaped then another task sharing the mm could be selected and that wouldn't help to move out from the oom situation. Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set. Set the flag after __oom_reap_task is done with a task. This will force the select_bad_process() to ignore all already oom reaped tasks as well as no such task is sacrificed for its parent. Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 31bd0d97d178..40eabf176ce2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_REAPED 21 /* mm has been already reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 415f7eb913fa..c0376efa79ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * This task can be safely ignored because we cannot do much more * to release its memory. 
*/ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + set_bit(MMF_OOM_REAPED, &mm->flags); out: mmput(mm); return ret; -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 +++++ kernel/fork.c | 50 +++++++++++++++++++++++++++++++++--------------- mm/oom_kill.c | 8 ++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fda9c99ef95..d553855503e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40eabf176ce2..479e3cade7e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. 
*/ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0376efa79ec..c0e37dd1422f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk) */ set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + mmput_async(mm); return ret; } -- cgit v1.2.3 From 98748bd722005be9de2662bd4f7e41ad8148bdbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:24 -0700 Subject: oom: consider multi-threaded tasks in task_will_free_mem task_will_free_mem is a misnomer for a more complex PF_EXITING test for early break out from the oom killer because it is believed that such a task would release its memory shortly and so we do not have to select an oom victim and perform a disruptive action. Currently we make sure that the given task is not participating in the core dumping because it might get blocked for a long time - see commit d003f371b270 ("oom: don't assume that a coredumping thread will exit soon"). The check can still do better though. We shouldn't consider the task unless the whole thread group is going down. This is rather unlikely but not impossible. A single exiting thread would surely leave all the address space behind. If we are really unlucky it might get stuck on the exit path and keep its TIF_MEMDIE and so block the oom killer. 
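The resulting predicate reduces to three checks, shown here as a self-contained userspace mock. The flag values and struct mock_task are illustrative stand-ins for the real task_struct/signal_struct fields; the logic mirrors the oom.h hunk that follows.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; the real ones live in include/linux/sched.h. */
#define PF_EXITING		0x1
#define SIGNAL_GROUP_EXIT	0x2
#define SIGNAL_GROUP_COREDUMP	0x4

struct mock_task {
	unsigned int flags;		/* per-thread flags */
	unsigned int signal_flags;	/* flags shared by the thread group */
	int nr_threads;			/* threads in the group */
};

/* Mirrors the three checks of the reworked task_will_free_mem(). */
static bool task_will_free_mem(const struct mock_task *task)
{
	/* A coredumping group may sleep in exit_mm() for a long time. */
	if (task->signal_flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (!(task->flags & PF_EXITING))
		return false;

	/* A single exiting thread is not enough; the whole group must go down. */
	if (task->nr_threads > 1 && !(task->signal_flags & SIGNAL_GROUP_EXIT))
		return false;

	return true;
}

int main(void)
{
	struct mock_task lone_exiting = { PF_EXITING, 0, 1 };
	struct mock_task one_of_many = { PF_EXITING, 0, 8 };

	printf("single-threaded exiting task: %d\n", task_will_free_mem(&lone_exiting));
	printf("one exiting thread of eight:  %d\n", task_will_free_mem(&one_of_many));
	return 0;
}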
Link: http://lkml.kernel.org/r/1460452756-15491-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Tetsuo Handa Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 83b9c39bd8b7..d3f533f2f481 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -110,13 +110,24 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); static inline bool task_will_free_mem(struct task_struct *task) { + struct signal_struct *sig = task->signal; + /* * A coredumping process may sleep for an extended period in exit_mm(), * so the oom killer cannot assume that the process will promptly exit * and release memory. */ - return (task->flags & PF_EXITING) && - !(task->signal->flags & SIGNAL_GROUP_COREDUMP); + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (!(task->flags & PF_EXITING)) + return false; + + /* Make sure that the whole thread group is going down */ + if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) + return false; + + return true; } /* sysctls */ -- cgit v1.2.3 From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 20 May 2016 16:57:27 -0700 Subject: mm,oom: speed up select_bad_process() loop Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"), select_bad_process() is using for_each_process_thread(). Since oom_unkillable_task() scans all threads in the caller's thread group and oom_task_origin() scans signal_struct of the caller's thread group, we don't need to call oom_unkillable_task() and oom_task_origin() on each thread. Also, since !mm test will be done later at oom_badness(), we don't need to do !mm test on each thread. Therefore, we only need to do TIF_MEMDIE test on each thread. Although the original code was correct it was quite inefficient because each thread group was scanned num_threads times which can be a lot especially with processes with many threads. Even though the OOM is extremely cold path it is always good to be as effective as possible when we are inside rcu_read_lock() - aka unpreemptible context. If we track number of TIF_MEMDIE threads inside signal_struct, we don't need to do TIF_MEMDIE test on each thread. This will allow select_bad_process() to use for_each_process(). This patch adds a counter to signal_struct for tracking how many TIF_MEMDIE threads are in a given thread group, and check it at oom_scan_process_thread() so that select_bad_process() can use for_each_process() rather than for_each_process_thread(). 
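A userspace sketch of the counter pattern, assuming C11 atomics; struct mock_signal_struct and the helper bodies are stand-ins for the kernel's atomic_t in signal_struct and the TIF_MEMDIE flag handling. The point it illustrates is that the selection loop now needs one read per process instead of a flag test on every thread.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the counter this patch adds to signal_struct. */
struct mock_signal_struct {
	atomic_int oom_victims;	/* # of TIF_MEMDIE threads in the group */
};

static void mark_oom_victim(struct mock_signal_struct *sig)
{
	atomic_fetch_add(&sig->oom_victims, 1);
}

static void exit_oom_victim(struct mock_signal_struct *sig)
{
	atomic_fetch_sub(&sig->oom_victims, 1);
}

/*
 * One read per process replaces a TIF_MEMDIE test on every thread of the
 * group, which is what lets select_bad_process() walk processes only.
 */
static bool group_has_oom_victim(struct mock_signal_struct *sig)
{
	return atomic_load(&sig->oom_victims) != 0;
}

int main(void)
{
	struct mock_signal_struct sig;

	atomic_init(&sig.oom_victims, 0);

	mark_oom_victim(&sig);
	printf("abort the scan? %d\n", group_has_oom_victim(&sig));
	exit_oom_victim(&sig);
	printf("abort the scan? %d\n", group_has_oom_victim(&sig));
	return 0;
}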
[mhocko@suse.com: do not blow the signal_struct size] Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 479e3cade7e9..01fe1bb68754 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -669,6 +669,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0e37dd1422f..5bb2f7698ad7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - continue; chosen = p; chosen_points = points; @@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.2.3 From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 May 2016 16:57:38 -0700 Subject: mm/vmalloc: keep a separate lazy-free list When mixing lots of vmallocs and set_memory_*() (which calls vm_unmap_aliases()) I encountered situations where the performance degraded severely due to the walking of the entire vmap_area list each invocation. One simple improvement is to add the lazily freed vmap_area to a separate lockless free list, such that we then avoid having to walk the full list on each purge. 
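The core of the change is the lock-free "push from many producers, drain once by the purger" pattern provided by the kernel's llist. A minimal userspace analogue is sketched below using C11 atomics; the vmap_area is reduced to an illustrative lazy_node carrying only a start/end range, and lazy_free()/purge_all() model llist_add()/llist_del_all() rather than reproduce them.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* A vmap_area reduced to the fields that matter for the purge list. */
struct lazy_node {
	struct lazy_node *next;
	unsigned long va_start, va_end;
};

static _Atomic(struct lazy_node *) purge_list;

/* llist_add() analogue: lock-free push, callable from many producers. */
static void lazy_free(struct lazy_node *node)
{
	struct lazy_node *first = atomic_load(&purge_list);

	do {
		node->next = first;
	} while (!atomic_compare_exchange_weak(&purge_list, &first, node));
}

/* llist_del_all() analogue: the purger grabs the whole list in one swap. */
static struct lazy_node *purge_all(void)
{
	return atomic_exchange(&purge_list, NULL);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct lazy_node *node = malloc(sizeof(*node));

		node->va_start = i * 4096UL;
		node->va_end = node->va_start + 4096;
		lazy_free(node);
	}

	for (struct lazy_node *node = purge_all(), *next; node; node = next) {
		next = node->next;
		printf("purging [%lu, %lu)\n", node->va_start, node->va_end);
		free(node);
	}
	return 0;
}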
Signed-off-by: Chris Wilson Reviewed-by: Roman Pen Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Daniel Vetter Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Pen Cc: Mel Gorman Cc: Toshi Kani Cc: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 ++- mm/vmalloc.c | 39 +++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d1f1d338af20..957adb741b6f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* pgprot_t */ #include @@ -44,7 +45,7 @@ struct vmap_area { unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ + struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; struct rcu_head rcu_head; }; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae7d20b447ff..6e3291882739 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; @@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (sync) purge_fragmented_blocks_allcpus(); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - rcu_read_unlock(); if (nr) atomic_sub(nr, &vmap_lazy_nr); @@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + int nr_lazy; + + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } -- cgit v1.2.3 From b8ca9e3a612eaf3e54c6fa136c62246a1a9aece7 Mon 
Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 May 2016 16:57:53 -0700 Subject: mm: tighten fault_in_pages_writeable() copy_page_to_iter_iovec() is currently the only user of fault_in_pages_writeable(), and it definitely can use fragments from high order pages. Make sure fault_in_pages_writeable() is only touching two adjacent pages at most, as claimed. Signed-off-by: Eric Dumazet Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe1513ffb7bf..97354102794d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -518,33 +518,27 @@ void page_endio(struct page *page, int rw, int err); extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); /* - * Fault a userspace page into pagetables. Return non-zero on a fault. - * - * This assumes that two userspace pages are always sufficient. + * Fault one or two userspace pages into pagetables. + * Return -EINVAL if more than two pages would be needed. + * Return non-zero on a fault. */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { - int ret; + int span, ret; if (unlikely(size == 0)) return 0; + span = offset_in_page(uaddr) + size; + if (span > 2 * PAGE_SIZE) + return -EINVAL; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ ret = __put_user(0, uaddr); - if (ret == 0) { - char __user *end = uaddr + size - 1; - - /* - * If the page was already mapped, this will get a cache miss - * for sure, so try to avoid doing it. - */ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); - } + if (ret == 0 && span > PAGE_SIZE) + ret = __put_user(0, uaddr + size - 1); return ret; } -- cgit v1.2.3 From 7fab358d90e6ba9d9cb702bee0c8a5f5c13bb6df Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:57:59 -0700 Subject: include/linux/hugetlb*.h: clean up code Macro HUGETLBFS_SB is clear enough, so one statement is clearer than 3 lines statements. Remove redundant return statements for non-return functions, which can save lines, at least. 
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 +--- include/linux/hugetlb_cgroup.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e44c57876e89..7ef4b635015d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -353,9 +353,7 @@ extern unsigned int default_hstate_idx; static inline struct hstate *hstate_inode(struct inode *i) { - struct hugetlbfs_sb_info *hsb; - hsb = HUGETLBFS_SB(i->i_sb); - return hsb->hstate; + return HUGETLBFS_SB(i->i_sb)->hstate; } static inline struct hstate *hstate_file(struct file *f) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 24154c26d469..063962f6dfc6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -93,20 +93,17 @@ hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - return; } static inline void hugetlb_cgroup_file_init(void) @@ -116,7 +113,6 @@ static inline void hugetlb_cgroup_file_init(void) static inline void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { - return; } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ -- cgit v1.2.3 From d70c17d436b3fb9dbdae8c93bf908a6110b0cb4f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:58:01 -0700 Subject: include/linux/hugetlb.h: use bool instead of int for hugepage_migration_supported() It is used as a pure bool function within kernel source wide. 
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7ef4b635015d..c26d4638f665 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -452,12 +452,12 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline int hugepage_migration_supported(struct hstate *h) +static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION return huge_page_shift(h) == PMD_SHIFT; #else - return 0; + return false; #endif } @@ -519,7 +519,7 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define hugepage_migration_supported(h) 0 +#define hugepage_migration_supported(h) false static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) -- cgit v1.2.3 From 0c9ad804f178eae02a34045bb0916fa0e31623d5 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 20 May 2016 16:58:04 -0700 Subject: mm fix commmets: if SPARSEMEM, pgdata doesn't have page_ext If SPARSEMEM, use page_ext in mem_section if !SPARSEMEM, use page_ext in pgdata Signed-off-by: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8dd0333b01dc..02069c23486d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1063,7 +1063,7 @@ struct mem_section { unsigned long *pageblock_flags; #ifdef CONFIG_PAGE_EXTENSION /* - * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use + * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; -- cgit v1.2.3 From d2a1a1f0a97a77d25cbada37161dc2ecdf01f93d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 May 2016 16:58:16 -0700 Subject: mm: use unsigned long constant for page flags struct page->flags is unsigned long, so when shifting bits we should use UL suffix to match it. Found this problem after I added 64-bit CPU specific page flags and failed to compile the kernel: mm/page_alloc.c: In function '__free_one_page': mm/page_alloc.c:672:2: error: integer overflow in expression [-Werror=overflow] Link: http://lkml.kernel.org/r/1461971723-16187-1-git-send-email-yuzhao@google.com Signed-off-by: Yu Zhao Cc: "Kirill A . 
Shutemov" Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Jerome Marchand Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a61e06e5fbce..e5a32445f930 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -479,7 +479,7 @@ static inline void ClearPageCompound(struct page *page) } #endif -#define PG_head_mask ((1L << PG_head)) +#define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); @@ -670,7 +670,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) } #ifdef CONFIG_MMU -#define __PG_MLOCKED (1 << PG_mlocked) +#define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif @@ -680,11 +680,11 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * these flags set. It they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ - (1 << PG_lru | 1 << PG_locked | \ - 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_writeback | 1 << PG_reserved | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED) + (1UL << PG_lru | 1UL << PG_locked | \ + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \ + 1UL << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. @@ -695,10 +695,10 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ - (1 << PG_private | 1 << PG_private_2) + (1UL << PG_private | 1UL << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked -- cgit v1.2.3 From 5f527c2b3ea261bfccb7d12f9feade924cc4987c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 20 May 2016 16:58:24 -0700 Subject: mm: thp: microoptimize compound_mapcount() compound_mapcount() is only called after PageCompound() has already been checked by the caller, so there's no point to check it again. Gcc may optimize it away too because it's inline but this will remove the runtime check for sure and add it'll add an assert instead. Link: http://lkml.kernel.org/r/1462547040-1737-3-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Kirill A. 
Shutemov Cc: Alex Williamson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b97be1147ec..65d18a45b8e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -475,8 +475,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page) static inline int compound_mapcount(struct page *page) { - if (!PageCompound(page)) - return 0; + VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); return atomic_read(compound_mapcount_ptr(page)) + 1; } -- cgit v1.2.3 From d2005e3f41d4f9299e2df6a967c8beb5086967a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 May 2016 16:58:36 -0700 Subject: userfaultfd: don't pin the user memory in userfaultfd_file_create() userfaultfd_file_create() increments mm->mm_users; this means that the memory won't be unmapped/freed if mm owner exits/execs, and UFFDIO_COPY after that can populate the orphaned mm more. Change userfaultfd_file_create() and userfaultfd_ctx_put() to use mm->mm_count to pin mm_struct. This means that atomic_inc_not_zero(mm->mm_users) is needed when we are going to actually play with this memory. Except handle_userfault() path doesn't need this, the caller must already have a reference. The patch adds the new trivial helper, mmget_not_zero(), it can have more users. Link: http://lkml.kernel.org/r/20160516172254.GA8595@redhat.com Signed-off-by: Oleg Nesterov Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 41 ++++++++++++++++++++++++++++------------- include/linux/sched.h | 7 ++++++- 2 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 66cdb44616d5..2d97952e341a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file) ACCESS_ONCE(ctx->released) = true; + if (!mmget_not_zero(mm)) + goto wakeup; + /* * Flush page faults out of all CPUs. 
NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if @@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } up_write(&mm->mmap_sem); - + mmput(mm); +wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = uffdio_register.range.start; end = start + uffdio_register.range.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); if (!ret) { /* * Now that we scanned all vmas we can already tell @@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = uffdio_unregister.start; end = start + uffdio_unregister.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); out: return ret; } @@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) goto out; - - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) @@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + if (mmget_not_zero(ctx->mm)) { + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) @@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) { - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } out: diff --git a/include/linux/sched.h b/include/linux/sched.h index 01fe1bb68754..6b3213d96da6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2723,12 +2723,17 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct * mm) +static inline void mmdrop(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline bool mmget_not_zero(struct mm_struct *mm) +{ + return atomic_inc_not_zero(&mm->mm_users); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* same 
as above but performs the slow path from the async kontext. Can -- cgit v1.2.3 From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Fri, 20 May 2016 16:58:38 -0700 Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments Since commit 92923ca3aace ("mm: meminit: only set page reserved in the memblock region") the reserved bit is set on reserved memblock regions. However start and end address are passed as unsigned long. This is only 32bit on i386, so it can end up marking the wrong pages reserved for ranges at 4GB and above. This was observed on a 32bit Xen dom0 which was booted with initial memory set to a value below 4G but allowing to balloon in memory (dom0_mem=1024M for example). This would define a reserved bootmem region for the additional memory (for example on a 8GB system there was a reverved region covering the 4GB-8GB range). But since the addresses were passed on as unsigned long, this was actually marking all pages from 0 to 4GB as reserved. Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region") Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com Signed-off-by: Stefan Bader Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65d18a45b8e8..fbdb9d40847f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); -extern void reserve_bootmem_region(unsigned long start, unsigned long end); +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f4b69aaa23a..2dd1ba4e70cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); -- cgit v1.2.3 From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 May 2016 16:58:41 -0700 Subject: mm: make faultaround produce old ptes Currently, faultaround code produces young pte. This can screw up vmscan behaviour[1], as it makes vmscan think that these pages are hot and not push them out on first round. During sparse file access faultaround gets more pages mapped and all of them are young. Under memory pressure, this makes vmscan swap out anon pages instead, or to drop other page cache pages which otherwise stay resident. Modify faultaround to produce old ptes, so they can easily be reclaimed under memory pressure. This can to some extend defeat the purpose of faultaround on machines without hardware accessed bit as it will not help us with reducing the number of minor page faults. 
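A rough model of that behaviour in plain C (invented names, not the kernel code): only the slot that actually faulted is made young, so a reclaim pass that skips young entries will naturally evict the speculatively mapped pages first.

#include <stdbool.h>
#include <stdio.h>

struct toy_pte {
        bool present;
        bool young;     /* stands in for the accessed bit */
};

/* Map a window of pages around a fault; only the faulting slot is made young. */
static void toy_fault_around(struct toy_pte *ptes, int n, int faulting)
{
        for (int i = 0; i < n; i++) {
                ptes[i].present = true;
                ptes[i].young = (i == faulting);  /* cf. pte_mkold()/pte_mkyoung() */
        }
}

int main(void)
{
        struct toy_pte ptes[8] = { { false, false } };

        toy_fault_around(ptes, 8, 3);
        for (int i = 0; i < 8; i++)
                printf("pte %d: %s\n", i, ptes[i].young ? "young" : "old");
        return 0;
}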
We may want to disable faultaround on such machines altogether, but that's subject for separate patchset. Minchan: "I tested 512M mmap sequential word read test on non-HW access bit system (i.e., ARM) and confirmed it doesn't increase minor fault any more. old: 4096 fault_around minor fault: 131291 elapsed time: 6747645 usec new: 65536 fault_around minor fault: 131291 elapsed time: 6709263 usec 0.56% benefit" [1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Minchan Kim Tested-by: Minchan Kim Acked-by: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/filemap.c | 2 +- mm/memory.c | 23 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fbdb9d40847f..f223ac26b5d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); + struct page *page, pte_t *pte, bool write, bool anon, bool old); #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index 8f4859989f1b..b418405903bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2191,7 +2191,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(vma, addr, page, pte, false, false, true); unlock_page(page); goto next; unlock: diff --git a/mm/memory.c b/mm/memory.c index 007c72ad03f6..f29e5ab0342d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) + struct page *page, pte_t *pte, bool write, bool anon, bool old) { pte_t entry; @@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old) + entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) goto unlock_out; + do_fault_around(vma, address, pte, pgoff, flags); + /* Check if the fault is handled by faultaround */ + if (!pte_same(*pte, orig_pte)) { + /* + * Faultaround produce old pte, but the pte we've + * handler fault for should be young. 
+ */ + pte_t entry = pte_mkyoung(*pte); + if (ptep_set_access_flags(vma, address, pte, entry, 0)) + update_mmu_cache(vma, address, pte); + goto unlock_out; + } pte_unmap_unlock(pte, ptl); } @@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false); + do_set_pte(vma, address, fault_page, pte, false, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true); + do_set_pte(vma, address, new_page, pte, true, true, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); + do_set_pte(vma, address, fault_page, pte, true, false, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3 From 0bb2fd13b69abfd88880f356903b5c7ca36d5eea Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 20 May 2016 16:58:59 -0700 Subject: mm: page_is_guard(): return false when page_ext arrays are not allocated yet When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: ffff88017c080040 ti: ffff88017c084000 task.ti: ffff88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: 0000:ffff88017c087c48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000980 RSI: 0000000000000080 RDI: 0000000000660401 RBP: ffff88017c087cd0 R08: 0000000000000401 R09: 0000000000000009 R10: ffff88017c080040 R11: 000000000000000a R12: 0000000000000400 R13: ffffea0019810000 R14: ffffea0019810040 R15: ffff88066cfe6080 FS: 0000000000000000(0000) GS:ffff88066cd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002406000 CR4: 00000000000006e0 Call Trace: free_hot_cold_page+0x192/0x1d0 __free_pages+0x5c/0x90 __free_pages_boot_core+0x11a/0x14e deferred_free_range+0x50/0x62 deferred_init_memmap+0x220/0x3c3 kthread+0xf8/0x110 ret_from_fork+0x22/0x40 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: 0000000000000000 The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. 
When it returns NULL, page_is_guard() should just return false instead of checking PAGE_EXT_DEBUG_GUARD unconditionally. Link: http://lkml.kernel.org/r/1463610225-29060-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f223ac26b5d9..b530c99e8e81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2386,6 +2386,9 @@ static inline bool page_is_guard(struct page *page) return false; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return false; + return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); } #else -- cgit v1.2.3 From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 May 2016 16:59:11 -0700 Subject: mm: kasan: initial memory quarantine implementation Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. When the object is freed, its state changes from KASAN_STATE_ALLOC to KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine instead of being returned to the allocator, therefore every subsequent access to that object triggers a KASAN error, and the error handler is able to say where the object has been allocated and deallocated. When it's time for the object to leave quarantine, its state becomes KASAN_STATE_FREE and it's returned to the allocator. From now on the allocator may reuse it for another allocation. Before that happens, it's still possible to detect a use-after free on that object (it retains the allocation/deallocation stacks). When the allocator reuses this object, the shadow is unpoisoned and old allocation/deallocation stacks are wiped. Therefore a use of this object, even an incorrect one, won't trigger ASan warning. Without the quarantine, it's not guaranteed that the objects aren't reused immediately, that's why the probability of catching a use-after-free is lower than with quarantine in place. Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. Freed objects are first added to per-cpu quarantine queues. When a cache is destroyed or memory shrinking is requested, the objects are moved into the global quarantine queue. Whenever a kmalloc call allows memory reclaiming, the oldest objects are popped out of the global queue until the total size of objects in quarantine is less than 3/4 of the maximum quarantine size (which is a fraction of installed physical memory). As long as an object remains in the quarantine, KASAN is able to report accesses to it, so the chance of reporting a use-after-free is increased. Once the object leaves quarantine, the allocator may reuse it, in which case the object is unpoisoned and KASAN can't detect incorrect accesses to it. Right now quarantine support is only enabled in SLAB allocator. Unification of KASAN features in SLAB and SLUB will be done later. This patch is based on the "mm: kasan: quarantine" patch originally prepared by Dmitry Chernenkov. A number of improvements have been suggested by Andrey Ryabinin. 
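A back-of-the-envelope sketch of the sizing policy described above, reusing the constants the patch introduces (a global cap of 1/32 of RAM minus 1 MiB per-CPU queue budgets, purged down to 3/4 of the cap); the helper name and the figures in main() are made up for illustration:

#include <stdio.h>

#define TOY_QUARANTINE_FRACTION 32              /* global cap = RAM / 32 */
#define TOY_PERCPU_BYTES        (1ULL << 20)    /* 1 MiB per-CPU queue budget */

/* Global cap after subtracting the per-CPU budgets (cf. quarantine_reduce()). */
static unsigned long long toy_quarantine_cap(unsigned long long ram_bytes,
                                             unsigned int ncpus)
{
        return ram_bytes / TOY_QUARANTINE_FRACTION -
               (unsigned long long)ncpus * TOY_PERCPU_BYTES;
}

int main(void)
{
        unsigned long long cap = toy_quarantine_cap(8ULL << 30, 4); /* 8 GiB, 4 CPUs */

        printf("global cap: %llu MiB, purge target (3/4): %llu MiB\n",
               cap >> 20, (cap * 3 / 4) >> 20);
        return 0;
}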
[glider@google.com: v9] Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- mm/kasan/Makefile | 1 + mm/kasan/kasan.c | 57 ++++++++-- mm/kasan/kasan.h | 21 +++- mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/report.c | 1 + mm/mempool.c | 2 +- mm/slab.c | 12 ++- mm/slab.h | 2 + mm/slab_common.c | 2 + 10 files changed, 387 insertions(+), 15 deletions(-) create mode 100644 mm/kasan/quarantine.c (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 737371b56044..611927f5870d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); +void kasan_cache_shrink(struct kmem_cache *cache); +void kasan_cache_destroy(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); -void kasan_slab_free(struct kmem_cache *s, void *object); +bool kasan_slab_free(struct kmem_cache *s, void *object); +void kasan_poison_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { int alloc_meta_offset; @@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} +static inline void kasan_cache_shrink(struct kmem_cache *cache) {} +static inline void kasan_cache_destroy(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size, static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) {} -static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +{ + return false; +} +static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 131daadf40e4..1548749a3d45 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-y := kasan.o report.o kasan_init.o +obj-$(CONFIG_SLAB) += quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 38f1dd79acdb..8df666bb23be 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } #endif +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void 
kasan_cache_destroy(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ #ifdef CONFIG_SLAB - if (cache->flags & SLAB_KASAN) { - struct kasan_free_meta *free_info = - get_free_info(cache, object); + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + if (likely(cache->flags & SLAB_KASAN)) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track, GFP_NOWAIT); + struct kasan_free_meta *free_info = + get_free_info(cache, object); + + switch (alloc_info->state) { + case KASAN_STATE_ALLOC: + alloc_info->state = KASAN_STATE_QUARANTINE; + quarantine_put(free_info, cache); + set_track(&free_info->track, GFP_NOWAIT); + kasan_poison_slab_free(cache, object); + return true; + case KASAN_STATE_QUARANTINE: + case KASAN_STATE_FREE: + pr_err("Double free"); + dump_stack(); + break; + default: + break; + } } + return false; +#else + kasan_poison_slab_free(cache, object); + return false; #endif - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(object == NULL)) return; @@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30a2f0ba0e09..7f7ac51d7faf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -62,6 +62,7 @@ struct kasan_global { enum kasan_state { KASAN_STATE_INIT, KASAN_STATE_ALLOC, + KASAN_STATE_QUARANTINE, KASAN_STATE_FREE }; @@ -79,9 +80,14 @@ struct kasan_alloc_meta { u32 reserved; }; +struct qlist_node { + struct qlist_node *next; +}; struct kasan_free_meta { - /* Allocator freelist pointer, unused by KASAN. */ - void **freelist; + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. 
+ */ + struct qlist_node quarantine_link; struct kasan_track track; }; @@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +#ifdef CONFIG_SLAB +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..4973505a9bdd --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,291 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a signle-linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +static void qlist_move(struct qlist_head *from, struct qlist_node *last, + struct qlist_head *to, size_t size) +{ + if (unlikely(last == from->tail)) { + qlist_move_all(from, to); + return; + } + if (qlist_empty(to)) + to->head = from->head; + else + to->tail->next = from->head; + to->tail = last; + from->head = last->next; + last->next = NULL; + from->bytes -= size; + to->bytes += size; +} + + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +static struct qlist_head global_quarantine; +static DEFINE_SPINLOCK(quarantine_lock); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. 
+ */ +#define QUARANTINE_FRACTION 32 + +#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) +#define QUARANTINE_PERCPU_SIZE (1 << 20) + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long flags; + + local_irq_save(flags); + alloc_info->state = KASAN_STATE_FREE; + ___cache_free(cache, object, _THIS_IP_); + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + qlist_move_all(q, &temp); + + local_irq_restore(flags); + + if (unlikely(!qlist_empty(&temp))) { + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_all(&temp, &global_quarantine); + spin_unlock_irqrestore(&quarantine_lock, flags); + } +} + +void quarantine_reduce(void) +{ + size_t new_quarantine_size; + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + size_t size_to_free = 0; + struct qlist_node *last; + + if (likely(READ_ONCE(global_quarantine.bytes) <= + READ_ONCE(quarantine_size))) + return; + + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. 
+ */ + new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus(); + WRITE_ONCE(quarantine_size, new_quarantine_size); + + last = global_quarantine.head; + while (last) { + struct kmem_cache *cache = qlink_to_cache(last); + + size_to_free += cache->size; + if (!last->next || size_to_free > + global_quarantine.bytes - QUARANTINE_LOW_SIZE) + break; + last = last->next; + } + qlist_move(&global_quarantine, last, &to_free, size_to_free); + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *prev = NULL, *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + while (curr) { + struct qlist_node *qlink = curr; + struct kmem_cache *obj_cache = qlink_to_cache(qlink); + + if (obj_cache == cache) { + if (unlikely(from->head == qlink)) { + from->head = curr->next; + prev = curr; + } else + prev->next = curr->next; + if (unlikely(from->tail == qlink)) + from->tail = curr->next; + from->bytes -= cache->size; + qlist_put(to, qlink, cache->size); + } else { + prev = curr; + } + curr = curr->next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_cache(&global_quarantine, &to_free, cache); + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 60869a5a0124..b3c122ddd454 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page, print_track(&alloc_info->track); break; case KASAN_STATE_FREE: + case KASAN_STATE_QUARANTINE: pr_err("Object freed, allocated with size %u bytes\n", alloc_info->alloc_size); free_info = get_free_info(cache, object); diff --git a/mm/mempool.c b/mm/mempool.c index 9b7a14a791cc..9e075f829d0d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); + kasan_poison_slab_free(pool->pool_data, element); if (pool->alloc == mempool_kmalloc) kasan_kfree(element); if (pool->alloc == mempool_alloc_pages) diff --git a/mm/slab.c b/mm/slab.c index c11bf5007952..28864c022430 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3547,9 +3547,17 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct array_cache *ac = cpu_cache_get(cachep); + /* Put the object into the quarantine, don't touch it for now. 
*/ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} - kasan_slab_free(cachep, objp); +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index 5969769fbee6..dedb1a920fb8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3239bfd758e6..a65dad7fdcd1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); + kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; @@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:28 -0700 Subject: mm/kasan: add API to check memory regions Memory access coded in an assembly won't be seen by KASAN as a compiler can instrument only C code. Add kasan_check_[read,write]() API which is going to be used to check a certain memory range. Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- include/linux/kasan-checks.h | 12 ++++++++++++ mm/kasan/kasan.c | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 include/linux/kasan-checks.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 374ffa2d81b7..8b92445561b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6242,7 +6242,7 @@ S: Maintained F: arch/*/include/asm/kasan.h F: arch/*/mm/kasan_init* F: Documentation/kasan.txt -F: include/linux/kasan.h +F: include/linux/kasan*.h F: lib/test_kasan.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h new file mode 100644 index 000000000000..b7f8aced7870 --- /dev/null +++ b/include/linux/kasan-checks.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_KASAN_CHECKS_H +#define _LINUX_KASAN_CHECKS_H + +#ifdef CONFIG_KASAN +void kasan_check_read(const void *p, unsigned int size); +void kasan_check_write(const void *p, unsigned int size); +#else +static inline void kasan_check_read(const void *p, unsigned int size) { } +static inline void kasan_check_write(const void *p, unsigned int size) { } +#endif + +#endif diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e5beb40d97b1..18b6a2b8d183 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + #undef memset void *memset(void *addr, int c, size_t len) { -- cgit v1.2.3 From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:48 -0700 Subject: zsmalloc: require GFP in zs_malloc() Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to zs_create_pool(), so we can be more flexible, but, more importantly, we need this to switch zram to per-cpu compression streams -- zram will try to allocate handle with preemption disabled in a fast path and switch to a slow path (using different gfp mask) if the fast one has failed. Apart from that, this also align zs_malloc() interface with zspool/zbud. 
[sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask] Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 24 +++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f76016d..b09acdb753ee 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (!handle) { pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 34eb16098a33..57a8e98f2708 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,10 +41,10 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(const char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); void *zs_map_object(struct zs_pool *pool, unsigned long handle, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ae288c9f7156..aba39a291523 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 
0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - if (zs_pool_stat_create(pool, name)) goto err; -- cgit v1.2.3 From 5f56a5dfdb9bcb3bca03df59980d4d2f012cbb53 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:16 -0700 Subject: exit_thread: remove empty bodies Define HAVE_EXIT_THREAD for archs which want to do something in exit_thread. For others, let's define exit_thread as an empty inline. This is a cleanup before we change the prototype of exit_thread to accept a task parameter. [akpm@linux-foundation.org: fix mips] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 5 +++++ arch/alpha/kernel/process.c | 8 -------- arch/arc/kernel/process.c | 7 ------- arch/arm/Kconfig | 1 + arch/arm64/kernel/process.c | 7 ------- arch/avr32/Kconfig | 1 + arch/blackfin/include/asm/processor.h | 7 ------- arch/c6x/kernel/process.c | 4 ---- arch/cris/Kconfig | 1 + arch/cris/arch-v10/kernel/process.c | 9 --------- arch/frv/include/asm/processor.h | 7 ------- arch/h8300/include/asm/processor.h | 7 ------- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/Kconfig | 1 + arch/m32r/kernel/process.c | 9 --------- arch/m68k/include/asm/processor.h | 7 ------- arch/metag/Kconfig | 1 + arch/metag/include/asm/processor.h | 2 -- arch/microblaze/include/asm/processor.h | 10 ---------- arch/mips/kernel/process.c | 4 ---- arch/mn10300/Kconfig | 1 + arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 9 --------- arch/parisc/kernel/process.c | 7 ------- arch/powerpc/kernel/process.c | 4 ---- arch/s390/Kconfig | 1 + arch/score/kernel/process.c | 2 -- arch/sh/Kconfig | 1 + arch/sh/kernel/process_32.c | 7 ------- arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/um/kernel/process.c | 4 ---- arch/unicore32/kernel/process.c | 7 ------- arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + include/linux/sched.h | 7 +++++++ 36 files changed, 24 insertions(+), 140 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5e7e17..0f298f9123dc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -517,6 +517,11 @@ config HAVE_ARCH_MMAP_RND_BITS - ARCH_MMAP_RND_BITS_MIN - ARCH_MMAP_RND_BITS_MAX +config HAVE_EXIT_THREAD + bool + help + An architecture implements exit_thread. + config ARCH_MMAP_RND_BITS_MIN int diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 84d13263ce46..b483156698d5 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -210,14 +210,6 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void -exit_thread(void) -{ -} - void flush_thread(void) { diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index a3f750e76b68..b5db9e7fd649 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -183,13 +183,6 @@ void flush_thread(void) { } -/* - * Free any architecture-specific thread data structures, etc. 
- */ -void exit_thread(void) -{ -} - int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) { return 0; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b99d25b4133e..956d3575426c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -50,6 +50,7 @@ config ARM select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 48eea6866c67..6cd2612236dc 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -200,13 +200,6 @@ void show_regs(struct pt_regs * regs) __show_regs(regs); } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - static void tls_thread_flush(void) { asm ("msr tpidr_el0, xzr"); diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 18b88779e701..e43519a2ca89 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -4,6 +4,7 @@ config AVR32 # that we usually don't need on AVR32. select EXPERT select HAVE_CLK + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 7acd46653df3..0c265aba94ad 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -75,13 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index 3ae9f5a166a0..0ee7686a78f3 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -82,10 +82,6 @@ void flush_thread(void) { } -void exit_thread(void) -{ -} - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 99bda1ba3d2f..5c0ca8ae9293 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -59,6 +59,7 @@ config CRIS select GENERIC_IOMAP select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 + select HAVE_EXIT_THREAD if ETRAX_ARCH_V32 select OLD_SIGSUSPEND select OLD_SIGACTION select GPIOLIB diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 02b783457be0..96e5afef6b47 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -35,15 +35,6 @@ void default_idle(void) local_irq_enable(); } -/* - * Free current thread data structures etc.. - */ - -void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* if the watchdog is enabled, we can simply disable interrupts and go * into an eternal loop, and the watchdog will reset the CPU after 0.1s * if on the other hand the watchdog wasn't enabled, we just enable it and wait diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index ae8d423e79d9..73f0a79ad8e6 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -96,13 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ... #define release_segments(mm) do { } while (0) #define forget_segments() do { } while (0) -/* - * Free current thread data structures etc.. 
- */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 54e3fd83c336..111df7397ac7 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -110,13 +110,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index a9ebd471823a..d9edfd3fc52a 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -136,13 +136,6 @@ void release_thread(struct task_struct *dead_task) { } -/* - * Free any architecture-specific thread data structures, etc. - */ -void exit_thread(void) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index b534ebab36ea..f80758cb7157 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -18,6 +18,7 @@ config IA64 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_EXIT_THREAD select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index e69221d581d5..a88b1f01e91f 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -101,15 +101,6 @@ void show_regs(struct pt_regs * regs) #endif } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* Nothing to do. */ - DPRINTK("pid = %d\n", current->pid); -} - void flush_thread(void) { DPRINTK("pid = %d\n", current->pid); diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 20dda1d4b860..a6ce2ec8d693 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -153,13 +153,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index a0fa88da3e31..e47a08d72819 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -11,6 +11,7 @@ config METAG select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_KERNEL_BZIP2 diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index 0838ca699764..a0333ebcac35 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -134,8 +134,6 @@ static inline void release_thread(struct task_struct *dead_task) #define copy_segments(tsk, mm) do { } while (0) #define release_segments(mm) do { } while (0) -extern void exit_thread(void); - /* * Return saved PC of a blocked thread. */ diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 497a988d79c2..c38d0dd91134 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -70,11 +70,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free all resources held by a thread. 
*/ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *t); extern unsigned long get_wchan(struct task_struct *p); @@ -127,11 +122,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc. */ -static inline void exit_thread(void) -{ -} - /* Return saved (kernel) PC of a blocked thread. */ # define thread_saved_pc(tsk) \ ((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index a6b3dc54260a..411c971e3417 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -73,10 +73,6 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) regs->regs[29] = sp; } -void exit_thread(void) -{ -} - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { /* diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 06ddb5501ab1..9627e81a6cbb 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -1,5 +1,6 @@ config MN10300 def_bool y + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_UID16 select GENERIC_IRQ_SHOW diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index c2ba45c159c7..1c953f0cadbf 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc.. */ -static inline void exit_thread(void) -{ -} - /* Return saved PC of a blocked thread. */ #define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea) diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 4d235e3d2534..70334c9f7d24 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -84,15 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); -/* - * Free current thread data structures etc.. - */ - -extern inline void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* * Return saved PC of a blocked thread. For now, this is the "user" PC */ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 809905a811ed..40639439d8b3 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -144,13 +144,6 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL(pm_power_off); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { /* Only needs to handle fpu stuff or perf monitors. 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ea8a28fd6f31..e2f12cbcade9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1329,10 +1329,6 @@ void show_regs(struct pt_regs * regs) show_instructions(regs); } -void exit_thread(void) -{ -} - void flush_thread(void) { #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index de0fcc08dff5..e2c9aaaf64b2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -134,6 +134,7 @@ config S390 select HAVE_DMA_API_DEBUG select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index a1519ad3d49d..aae9480706c2 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -56,8 +56,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) regs->regs[0] = sp; } -void exit_thread(void) {} - /* * When a process does an "exec", machine state like FPU and debug * registers need to be reset. This is a hook function for that. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7ed20fc3fc81..cb93af8f8017 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,6 +71,7 @@ config SUPERH32 config SUPERH64 def_bool ARCH = "sh64" + select HAVE_EXIT_THREAD select KALLSYMS config ARCH_DEFCONFIG diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 2885fc9d9dcd..ee12e9451874 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -76,13 +76,6 @@ void start_thread(struct pt_regs *regs, unsigned long new_pc, } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index db0a26cffa97..27b3a0ad40a0 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -20,6 +20,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select HAVE_EXIT_THREAD select SYSCTL_EXCEPTION_TRACE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select RTC_CLASS diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 81719302b056..174746225577 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -3,6 +3,7 @@ config TILE def_bool y + select HAVE_EXIT_THREAD select HAVE_PERF_EVENTS select USE_PMC if PERF_EVENTS select HAVE_DMA_API_DEBUG diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 48af59aae129..0b04711f1f18 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -103,10 +103,6 @@ void interrupt_end(void) tracehook_notify_resume(regs); } -void exit_thread(void) -{ -} - int get_current_pid(void) { return task_pid_nr(current); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b008e9961465..00299c927852 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -201,13 +201,6 @@ void show_regs(struct pt_regs *regs) __backtrace(); } -/* - * Free current thread data structures etc.. 
- */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct thread_info *thread = current_thread_info(); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ace79d2da2c3..8ff5b3be95d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,6 +105,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FP_TEST diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 85257afe71c3..64336f666fb6 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -14,6 +14,7 @@ config XTENSA select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select HAVE_DMA_API_DEBUG + select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU select HAVE_HW_BREAKPOINT if PERF_EVENTS diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b3213d96da6..167c0d4bf3fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2769,7 +2769,14 @@ static inline int copy_thread_tls( } #endif extern void flush_thread(void); + +#ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(void); +#else +static inline void exit_thread(void) +{ +} +#endif extern void exit_files(struct task_struct *); extern void __cleanup_sighand(struct sighand_struct *); -- cgit v1.2.3 From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 4 ++-- arch/arm/vfp/vfpmodule.c | 4 ---- arch/avr32/kernel/process.c | 4 ++-- arch/cris/arch-v32/kernel/process.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/process.c | 14 +++++++------- arch/metag/kernel/process.c | 6 +++--- arch/mn10300/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 5 +++-- arch/sh/kernel/process_64.c | 5 ++--- arch/sparc/kernel/process_32.c | 12 ++++++------ arch/sparc/kernel/process_64.c | 4 ++-- arch/tile/kernel/process.c | 4 ++-- arch/x86/kernel/process.c | 5 ++--- arch/xtensa/kernel/process.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 2 +- 17 files changed, 42 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 4adfb46e3ee9..a647d6642f3e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - thread_notify(THREAD_NOTIFY_EXIT, current_thread_info()); + thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk)); } void flush_thread(void) diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 2a61e4b04600..73085d3482ed 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread) * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. * THREAD_NOTIFY_EXIT - * - the thread (v) will be running on the local CPU, so - * v === current_thread_info() - * - thread->cpu is the local CPU number at the time it is accessed, - * but may change at any time. * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. 
*/ diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 42a53e740a7e..68e5b9dac059 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -62,9 +62,9 @@ void machine_restart(char *cmd) /* * Free current thread data structures etc */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - ocd_disable(current); + ocd_disable(tsk); } void flush_thread(void) diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c7ce784a393c..4d1afa9f9fd3 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -33,9 +33,9 @@ void default_idle(void) */ extern void deconfigure_bp(long pid); -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - deconfigure_bp(current->pid); + deconfigure_bp(tsk->pid); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9cd607b06964..2436ad5f92c1 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg /* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) + * called only from exit_thread() + * we come here only if the task has a context attached (loaded or masked) */ void pfm_exit_thread(struct task_struct *task) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index b51514957620..aae6c4dc7ae7 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -570,22 +570,22 @@ flush_thread (void) } /* - * Clean up state associated with current thread. This is called when + * Clean up state associated with a thread. This is called when * the thread calls exit(). */ void -exit_thread (void) +exit_thread (struct task_struct *tsk) { - ia64_drop_fpu(current); + ia64_drop_fpu(tsk); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); + if (tsk->thread.pfm_context) + pfm_exit_thread(tsk); /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(tsk); #endif } diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 7f546183a0f0..35062796edf2 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -345,10 +345,10 @@ void flush_thread(void) /* * Free current thread data structures etc. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - clear_fpu(¤t->thread); - clear_dsp(¤t->thread); + clear_fpu(&tsk->thread); + clear_dsp(&tsk->thread); } /* TODO: figure out how to unwind the kernel stack here to figure out diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 74a96ccf7451..cbede4e88dee 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs) /* * free current thread data structures etc.. 
*/ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_fpu(current); + exit_fpu(tsk); } void flush_thread(void) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 481d7a83efc6..bba4fa74b321 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -68,9 +68,10 @@ extern void kernel_thread_starter(void); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_thread_runtime_instr(); + if (tsk == current) + exit_thread_runtime_instr(); } void flush_thread(void) diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index e2062e643341..9d3e9916555d 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { /* * See arch/sparc/kernel/process.c for the precedent for doing @@ -307,9 +307,8 @@ void exit_thread(void) * which it would get safely nulled. */ #ifdef CONFIG_SH_FPU - if (last_task_used_math == current) { + if (last_task_used_math == tsk) last_task_used_math = NULL; - } #endif } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index c5113c7ce2fd..b7780a5bef11 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifndef CONFIG_SMP - if(last_task_used_math == current) { + if (last_task_used_math == tsk) { #else - if (test_thread_flag(TIF_USEDFPU)) { + if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) { #endif /* Keep process from leaving FPU in a bogon state. */ put_psr(get_psr() | PSR_EF); - fpsave(¤t->thread.float_regs[0], ¤t->thread.fsr, - ¤t->thread.fpqueue[0], ¤t->thread.fpqdepth); + fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr, + &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth); #ifndef CONFIG_SMP last_task_used_math = NULL; #else - clear_thread_flag(TIF_USEDFPU); + clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU); #endif } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index c16ef1af1843..fa14402b33f9 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk) } /* Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct thread_info *t = current_thread_info(); + struct thread_info *t = task_thread_info(tsk); if (t->utraps) { if (t->utraps[0] < 2) diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index b5f30d376ce1..6b705ccc9cc1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -541,7 +541,7 @@ void flush_thread(void) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifdef CONFIG_HARDWALL /* @@ -550,7 +550,7 @@ void exit_thread(void) * the last reference to a hardwall fd, it would already have * been released and deactivated at this point.) 
*/ - hardwall_deactivate_all(current); + hardwall_deactivate_all(tsk); #endif } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 5bbfed81c97b..e0ded48561db 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -115,10 +115,10 @@ void arch_cpu_idle(void) /* * This is called when the thread calls exit(). */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #if XTENSA_HAVE_COPROCESSORS - coprocessor_release_all(current_thread_info()); + coprocessor_release_all(task_thread_info(tsk)); #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 167c0d4bf3fa..02bdab4d6db7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2771,9 +2771,9 @@ static inline int copy_thread_tls( extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD -extern void exit_thread(void); +extern void exit_thread(struct task_struct *tsk); #else -static inline void exit_thread(void) +static inline void exit_thread(struct task_struct *tsk) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit v1.2.3 From 2eeed7e98d6a1341b1574893a95ce5b8379140f2 Mon Sep 17 00:00:00 2001 From: René Nyffenegger Date: Fri, 20 May 2016 17:00:30 -0700 Subject: include/linux/syscalls.h: use pid_t instead of int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In include/linux/syscalls.h, the four functions sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo are declared with "int pid" and "int tgid". However, in kernel/signal.c, the corresponding definitions use the more appropriate "pid_t" (which is a typedef'd int). This patch changes "int" to "pid_t" in the declarations of sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo in in order to harmonize the function declarations with their respective definitions. 
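For context only (not part of the patch): pid_t is a typedef'd integer type and the C library already declares the corresponding userspace interfaces with pid_t, so the change is purely declarative and alters no ABI or behaviour. A minimal userspace sketch of the matching libc prototype in use:

	#include <signal.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t self = getpid();	/* libc already types process IDs as pid_t */

		/* Signal 0 only performs existence/permission checking. */
		if (kill(self, 0) == 0)
			printf("process %d is alive\n", (int)self);
		return 0;
	}
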
Link: http://lkml.kernel.org/r/57302FDA.7020205@renenyffenegger.ch Signed-off-by: René Nyffenegger Cc: Andy Lutomirski Cc: Josh Triplett Cc: Al Viro Cc: "Steven Rostedt (Red Hat)" Cc: Zach Brown Cc: Milosz Tanski Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d795472c54d8..d02239022bd0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -371,10 +371,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, size_t sigsetsize); asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t __user *uinfo); -asmlinkage long sys_kill(int pid, int sig); -asmlinkage long sys_tgkill(int tgid, int pid, int sig); -asmlinkage long sys_tkill(int pid, int sig); -asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); +asmlinkage long sys_kill(pid_t pid, int sig); +asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig); +asmlinkage long sys_tkill(pid_t pid, int sig); +asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); asmlinkage long sys_signal(int sig, __sighandler_t handler); -- cgit v1.2.3 From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. 
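A simplified, single-buffer userspace model of the scheme described above may help; it is an illustration only (the names are made up), it ignores the per-CPU handling, the memory barriers and the irq_work hand-off, and it assumes a single writer per buffer, as NMIs do not nest:

	#include <stdarg.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define BUF_LEN 256

	static struct {
		atomic_int len;			/* length of written data */
		char buffer[BUF_LEN];
	} nmi_buf;

	/* Writer side, mirroring vprintk_nmi(): append, retry if a flush raced us. */
	static int nmi_log(const char *fmt, ...)
	{
		va_list args;
		int len, add;

	again:
		len = atomic_load(&nmi_buf.len);
		if (len >= BUF_LEN)
			return 0;		/* buffer full, message is dropped */

		va_start(args, fmt);
		add = vsnprintf(nmi_buf.buffer + len, BUF_LEN - len, fmt, args);
		va_end(args);

		/* A concurrent flush may have reset len to 0; if so, start over. */
		if (!atomic_compare_exchange_strong(&nmi_buf.len, &len, len + add))
			goto again;
		return add;
	}

	/* Reader side, mirroring __printk_nmi_flush(): print, then truncate. */
	static void nmi_flush(void)
	{
		int len = atomic_load(&nmi_buf.len);

		if (!len)
			return;			/* someone else already flushed */
		printf("%.*s", len, nmi_buf.buffer);
		atomic_compare_exchange_strong(&nmi_buf.len, &len, 0);
	}

	int main(void)
	{
		nmi_log("NMI backtrace for cpu %d\n", 0);
		nmi_flush();
		return 0;
	}

The real implementation additionally pairs the cmpxchg with explicit barriers and defers the flushing to IRQ work in a safe context, as described above.
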
The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 + arch/arm/Kconfig | 1 + arch/arm/kernel/smp.c | 2 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/cris/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 1 - include/linux/hardirq.h | 2 + include/linux/percpu.h | 3 - include/linux/printk.h | 12 ++- init/Kconfig | 5 + init/main.c | 1 + kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 +++++++++ kernel/printk/nmi.c | 219 ++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- lib/nmi_backtrace.c | 89 +---------------- 24 files changed, 306 insertions(+), 107 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 0f298f9123dc..8f84fd268dee 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -187,7 +187,11 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_NMI + bool + config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool # # An arch should select this if it provides all these things: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 956d3575426c..90542db1220d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -67,6 +67,7 @@ config ARM select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_OPTPROBES if !THUMB2_KERNEL select HAVE_PERF_EVENTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index baee70267f29..df90bc59bfce 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CPU_BACKTRACE: + printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); + printk_nmi_exit(); break; default: diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e43519a2ca89..7e75d45e20cd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -18,6 +18,7 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA + select HAVE_NMI help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index a63c12259e77..28c63fea786d 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -40,6 +40,7 @@ config BLACKFIN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select HAVE_NMI config GENERIC_CSUM def_bool y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 5c0ca8ae9293..deba2662b9f3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -70,6 +70,7 @@ config CRIS select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32 select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 select 
HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32 + select HAVE_NMI config HZ int diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5663f411c225..8040fb1845b4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -48,6 +48,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_CMOS_UPDATE select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select VIRT_TO_BUS select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f0403b58ae8b..01f7464d9fea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,6 +155,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e2c9aaaf64b2..1c3c43d9d1b5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cb93af8f8017..f6254341c065 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select HAVE_NMI help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 27b3a0ad40a0..1012f7ffcdf5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -79,6 +79,7 @@ config SPARC64 select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_NMI config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 174746225577..76989b878f3c 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -30,6 +30,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select HAVE_NMI if USE_PMC select EDAC_SUPPORT select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ff5b3be95d4..0a7b885964ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index dfd59d6bc6f0..c683996110b1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,7 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ @@ -77,6 +78,7 @@ extern void irq_exit(void); preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ + printk_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4bc6dafb703e..56939d3f6e53 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -129,7 
+129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -/* To avoid include hell, as printk can not declare this, we declare it here */ -DECLARE_PER_CPU(printk_func_t, printk_func); - #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9ccbdf2c1453..51dd6b824fe2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK_NMI +extern void printk_nmi_init(void); +extern void printk_nmi_enter(void); +extern void printk_nmi_exit(void); +extern void printk_nmi_flush(void); +#else +static inline void printk_nmi_init(void) { } +static inline void printk_nmi_enter(void) { } +static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_flush(void) { } +#endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/init/Kconfig b/init/Kconfig index 79a91a2c0444..bccc1d607be5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1454,6 +1454,11 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + config BUG bool "BUG() support" if EXPERT default y diff --git a/init/main.c b/init/main.c index 2075fafaad59..fa9b2bdde183 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); + printk_nmi_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. 
+ */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. 
*/ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. 
*/ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c669e..26caf51cc238 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include #include #include -#include #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? 
"all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.2.3 From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). 
Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 6 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 51dd6b824fe2..f4da695fd615 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -127,11 +127,13 @@ extern void printk_nmi_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); extern void printk_nmi_flush(void); +extern void printk_nmi_flush_on_panic(void); #else static inline void printk_nmi_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } static inline void printk_nmi_flush(void) { } +static inline void printk_nmi_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. 
+ */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit v1.2.3 From 8da4b8c48e7b43cb16d05e1dbb34ad9f73ab7efd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:00 -0700 Subject: lib/uuid.c: move generate_random_uuid() to uuid.c Let's gather the UUID related functions under one hood. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 21 +-------------------- fs/btrfs/volumes.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 2 +- fs/reiserfs/objectid.c | 2 +- fs/ubifs/sb.c | 2 +- include/linux/random.h | 1 - include/linux/uuid.h | 2 ++ lib/uuid.c | 20 ++++++++++++++++++++ 9 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index b583e5336630..0158d3bff7e5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -260,6 +260,7 @@ #include #include #include +#include #include #include @@ -1621,26 +1622,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count, return urandom_read(NULL, buf, count, NULL); } -/*************************************************************** - * Random UUID interface - * - * Used here for a Boot ID, but can be useful for other kernel - * drivers. 
- ***************************************************************/ - -/* - * Generate random UUID - */ -void generate_random_uuid(unsigned char uuid_out[16]) -{ - get_random_bytes(uuid_out, 16); - /* Set UUID version to 4 --- truly random generation */ - uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40; - /* Set the UUID variant to DCE */ - uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80; -} -EXPORT_SYMBOL(generate_random_uuid); - /******************************************************************** * * Sysctl interface diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..bfb80da3e6eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..7497f50cb293 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include "ext4_jbd2.h" #include "ext4.h" diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eb9d027e5981..c6b14951bef3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 99a5d5dae46a..415d66ca87d1 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -3,8 +3,8 @@ */ #include -#include #include +#include #include "reiserfs.h" /* find where objectid map starts */ diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index f4fbc7b6b794..3cbb904a6d7d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,8 +28,8 @@ #include "ubifs.h" #include -#include #include +#include /* * Default journal size in logical eraseblocks as a percent of total diff --git a/include/linux/random.h b/include/linux/random.h index 9c29122037f9..e47e533742b5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,6 @@ extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -void generate_random_uuid(unsigned char uuid_out[16]); extern int random_int_secret_init(void); #ifndef MODULE diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 6df2509033d7..91c2b6d9cbb7 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -33,6 +33,8 @@ static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2) return memcmp(&u1, &u2, sizeof(uuid_be)); } +void generate_random_uuid(unsigned char uuid[16]); + extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); diff --git a/lib/uuid.c b/lib/uuid.c index 398821e4dce1..6c81c0b0467e 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -23,6 +23,26 @@ #include #include +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. 
+ ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid[16]) +{ + get_random_bytes(uuid, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid[6] = (uuid[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid[8] = (uuid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + static void __uuid_gen_common(__u8 b[16]) { prandom_bytes(b, 16); -- cgit v1.2.3 From 2b1b0d66704a8cafe83be7114ec4c15ab3a314ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:04 -0700 Subject: lib/uuid.c: introduce a few more generic helpers There are new helpers in this patch: uuid_is_valid checks if a UUID is valid uuid_be_to_bin converts from string to binary (big endian) uuid_le_to_bin converts from string to binary (little endian) They will be used in future, i.e. in the following patches in the series. This also moves the indices arrays to lib/uuid.c to be shared accross modules. [andriy.shevchenko@linux.intel.com: fix typo] Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 13 +++++++++++ lib/uuid.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/vsprintf.c | 9 ++++---- 3 files changed, 82 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 91c2b6d9cbb7..e0b95e728a77 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -22,6 +22,11 @@ #include +/* + * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") + * not including trailing NUL. + */ +#define UUID_STRING_LEN 36 static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) { @@ -38,4 +43,12 @@ void generate_random_uuid(unsigned char uuid[16]); extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); +bool __must_check uuid_is_valid(const char *uuid); + +extern const u8 uuid_le_index[16]; +extern const u8 uuid_be_index[16]; + +int uuid_le_to_bin(const char *uuid, uuid_le *u); +int uuid_be_to_bin(const char *uuid, uuid_be *u); + #endif diff --git a/lib/uuid.c b/lib/uuid.c index 6c81c0b0467e..82787f652fbc 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -19,10 +19,17 @@ */ #include +#include +#include #include #include #include +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + /*************************************************************** * Random UUID interface * @@ -65,3 +72,61 @@ void uuid_be_gen(uuid_be *bu) bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_be_gen); + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string. 
+ */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i]] + 0); + int lo = hex_to_bin(uuid[si[i]] + 1); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index be0e7cf11e48..0967771d8f7f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #ifdef CONFIG_BLOCK #include @@ -1304,19 +1305,17 @@ static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { - char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - const u8 *index = be; + const u8 *index = uuid_be_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = le; + index = uuid_le_index; break; case 'B': uc = true; -- cgit v1.2.3 From e3a93bce69ad3e2c38927abe311b8cb4f17abbaf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:07 -0700 Subject: lib/uuid.c: remove FSF address There is no point in keeping an address in the file since it's subject to change. While here, update Intel Copyright years. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 6 +----- include/uapi/linux/uuid.h | 4 ---- lib/uuid.c | 6 +----- 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index e0b95e728a77..2d095fc60204 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -1,7 +1,7 @@ /* * UUID/GUID definition * - * Copyright (C) 2010, Intel Corp. + * Copyright (C) 2010, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_UUID_H_ #define _LINUX_UUID_H_ diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 786f0773cc33..3738e5fb6a4d 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _UAPI_LINUX_UUID_H_ diff --git a/lib/uuid.c b/lib/uuid.c index 82787f652fbc..e116ae5fa00f 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -1,7 +1,7 @@ /* * Unified UUID/GUID definition * - * Copyright (C) 2009, Intel Corp. + * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include -- cgit v1.2.3 From ba7e34b1bbd2722685bbc75d168672d5154d8614 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:18 -0700 Subject: include/linux/efi.h: redefine type, constant, macro from generic code Generic UUID library defines structure type, macro to define UUID, and the length of the UUID string. This patch removes duplicate data structure definition, UUID string length constant as well as macro for UUID handling. 
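Call sites are unaffected by the switch: EFI_GUID() keeps its argument order and now simply expands to UUID_LE(), with efi_guid_t becoming a typedef for uuid_le. A minimal sketch, using a made-up GUID value:

#include <linux/efi.h>

/* Example GUID (hypothetical value); efi_guid_t is now an alias for uuid_le. */
static const efi_guid_t example_vendor_guid =
	EFI_GUID(0x12345678, 0x9abc, 0xdef0,
		 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef);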
Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index df7acb51f3cc..c2db3ca22217 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -44,17 +45,10 @@ typedef u16 efi_char16_t; /* UNICODE character */ typedef u64 efi_physical_addr_t; typedef void *efi_handle_t; - -typedef struct { - u8 b[16]; -} efi_guid_t; +typedef uuid_le efi_guid_t; #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ -((efi_guid_t) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) /* * Generic EFI table header @@ -1117,7 +1111,7 @@ extern int efi_status_to_err(efi_status_t status); * Length of a GUID string (strlen("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")) * not including trailing NUL */ -#define EFI_VARIABLE_GUID_LEN 36 +#define EFI_VARIABLE_GUID_LEN UUID_STRING_LEN /* * The type of search to perform when calling boottime->locate_handle -- cgit v1.2.3 From 63579785752ba7d0e842078ec6b2875367046f06 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:24 -0700 Subject: include/linux/genhd.h: move to use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c706765404a..359a8e4bd44d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -93,7 +94,7 @@ struct disk_stats { * Enough for the string representation of any kind of UUID plus NULL. * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
*/ -#define PARTITION_META_INFO_UUIDLTH 37 +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) struct partition_meta_info { char uuid[PARTITION_META_INFO_UUIDLTH]; @@ -228,27 +229,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } -static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) -{ - int i; - for (i = 0; i < 16; ++i) { - *to++ = (hex_to_bin(*uuid_str) << 4) | - (hex_to_bin(*(uuid_str + 1))); - uuid_str += 2; - switch (i) { - case 3: - case 5: - case 7: - case 9: - uuid_str++; - continue; - } - } -} - static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to) { - part_pack_uuid(uuid_str, to); + uuid_be_to_bin(uuid_str, (uuid_be *)to); return 0; } -- cgit v1.2.3 From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. 
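The subtle part is that a tree holding a single entry at index 0 keeps that entry directly in root->rnode with height 0, so a height check cannot tell "empty" apart from "one entry". A minimal sketch of the new helper's behaviour (hypothetical function; locking and error handling omitted):

RADIX_TREE(tree, GFP_KERNEL);		/* statically initialized, empty */

static void radix_tree_empty_example(void *item)
{
	WARN_ON(!radix_tree_empty(&tree));	/* empty: rnode == NULL */

	radix_tree_insert(&tree, 0, item);

	/*
	 * The entry is stored directly in root->rnode, so the height is
	 * still 0 - but radix_tree_empty() correctly reports non-empty.
	 */
	WARN_ON(radix_tree_empty(&tree));
}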
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 +++++ kernel/irq/irqdomain.c | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..83f708e5db59 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -136,6 +136,11 @@ do { \ (root)->rnode = NULL; \ } while (0) +static inline bool radix_tree_empty(struct radix_tree_root *root) +{ + return root->rnode == NULL; +} + /** * Radix-tree synchronization * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit v1.2.3 From 97d778b2de9213c7a7483dad0f533c1af9f0810f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:42 -0700 Subject: radix tree test suite: allow testing other fan-out values The defines in regression2.c are already in radix-tree.h and duplicating them in the test case makes experimenting with other values for the fan-out harder than necessary. Allow the user of the radix tree to decide what the fan-out should be rather than fixing it to 8 for non-kernel uses. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 +--- tools/testing/radix-tree/linux/kernel.h | 2 ++ tools/testing/radix-tree/regression2.c | 7 ------- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 83f708e5db59..5ce5a1e0ecc5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -70,10 +70,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 -#ifdef __KERNEL__ +#ifndef RADIX_TREE_MAP_SHIFT #define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ #endif #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 76a88f35fdc4..31fe2c77d7ae 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -12,6 +12,8 @@ #define CONFIG_SHMEM #define CONFIG_SWAP +#define RADIX_TREE_MAP_SHIFT 3 + #ifndef NULL #define NULL 0 #endif diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 5d2fa28cdca3..63bf347aaf33 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -51,13 +51,6 @@ #include "regression.h" -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 
4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 #define PAGECACHE_TAG_TOWRITE 2 -- cgit v1.2.3 From 6c4bd68a2962c03423a226d949caf64216d013cc Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:51 -0700 Subject: radix-tree: remove unused looping macros radix_tree_for_each_chunk() and radix_tree_for_each_chunk_slot() have never been used in the kernel since their introduction in 2012, so remove them. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 5ce5a1e0ecc5..e1512a607709 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -478,34 +478,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; } -/** - * radix_tree_for_each_chunk - iterate over chunks - * - * @slot: the void** variable for pointer to chunk first slot - * @root: the struct radix_tree_root pointer - * @iter: the struct radix_tree_iter pointer - * @start: iteration starting index - * @flags: RADIX_TREE_ITER_* and tag index - * - * Locks can be released and reacquired between iterations. - */ -#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \ - for (slot = radix_tree_iter_init(iter, start) ; \ - (slot = radix_tree_next_chunk(root, iter, flags)) ;) - -/** - * radix_tree_for_each_chunk_slot - iterate over slots in one chunk - * - * @slot: the void** variable, at the beginning points to chunk first slot - * @iter: the struct radix_tree_iter pointer - * @flags: RADIX_TREE_ITER_*, should be constant - * - * This macro is designed to be nested inside radix_tree_for_each_chunk(). - * @slot points to the radix tree slot, @iter->index contains its index. - */ -#define radix_tree_for_each_chunk_slot(slot, iter, flags) \ - for (; slot ; slot = radix_tree_next_slot(slot, iter, flags)) - /** * radix_tree_for_each_slot - iterate over non-empty slots * -- cgit v1.2.3 From 21ef533931f73a8e963a6107aa5ec51b192f28be Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:02:26 -0700 Subject: radix-tree: add support for multi-order iterating This enables the macros radix_tree_for_each_slot() and friends to be used with multi-order entries. The way that this works is that we treat all entries in a given slots[] array as a single chunk. If the index given to radix_tree_next_chunk() happens to point us to a sibling entry, we will back up iter->index so that it points to the canonical entry, and that will be the place where we start our iteration. As we're processing a chunk in radix_tree_next_slot(), we process canonical entries, skip over sibling entries, and restart the chunk lookup if we find a non-sibling indirect pointer. This drops back to the radix_tree_next_chunk() code, which will re-walk the tree and look for another chunk. This allows us to properly handle multi-order entries mixed with other entries that are at various heights in the radix tree. 
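From the caller's side nothing changes: the standard iteration loop below keeps working when some of the visited entries are multi-order, because sibling slots are skipped and only the canonical entry is returned. The function is a generic usage sketch (not code from this patch); RCU read-side locking around the walk is assumed to be handled by the caller:

#include <linux/kernel.h>
#include <linux/radix-tree.h>

static void walk_all_entries(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	void **slot;

	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		/* The slot moved under us (node being freed): rewalk the chunk. */
		if (radix_tree_deref_retry(entry)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		pr_info("index %lu -> entry %p\n", iter.index, entry);
	}
}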
Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 69 +++++++++++++++++++++++---- lib/radix-tree.c | 66 ++++++++++++++----------- tools/testing/radix-tree/generated/autoconf.h | 3 ++ tools/testing/radix-tree/linux/kernel.h | 5 +- 4 files changed, 102 insertions(+), 41 deletions(-) create mode 100644 tools/testing/radix-tree/generated/autoconf.h (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e1512a607709..8558d52e1c7b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -330,8 +330,9 @@ static inline void radix_tree_preload_end(void) * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot - * @next_index: next-to-last index for this chunk + * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating + * @shift: shift for the node that holds our slots * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is @@ -344,8 +345,20 @@ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + unsigned int shift; +#endif }; +static inline unsigned int iter_shift(struct radix_tree_iter *iter) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + return iter->shift; +#else + return 0; +#endif +} + #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ @@ -405,6 +418,12 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) return NULL; } +static inline unsigned long +__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) +{ + return iter->index + (slots << iter_shift(iter)); +} + /** * radix_tree_iter_next - resume iterating when the chunk may be invalid * @iter: iterator state @@ -416,7 +435,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) static inline __must_check void **radix_tree_iter_next(struct radix_tree_iter *iter) { - iter->next_index = iter->index + 1; + iter->next_index = __radix_tree_iter_add(iter, 1); iter->tags = 0; return NULL; } @@ -430,7 +449,12 @@ void **radix_tree_iter_next(struct radix_tree_iter *iter) static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { - return iter->next_index - iter->index; + return (iter->next_index - iter->index) >> iter_shift(iter); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } /** @@ -448,24 +472,51 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { + void *canon = slot; + iter->tags >>= 1; + if (unlikely(!iter->tags)) + return NULL; + while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(slot[1])) { + if (indirect_to_ptr(slot[1]) == canon) { + iter->tags >>= 1; + iter->index = __radix_tree_iter_add(iter, 1); + slot++; + continue; + } + iter->next_index = __radix_tree_iter_add(iter, 1); + return NULL; + } if (likely(iter->tags & 1ul)) { - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); return slot + 1; } - if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) { + if 
(!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset; - iter->index += offset + 1; + iter->index = __radix_tree_iter_add(iter, offset + 1); return slot + offset + 1; } } else { - long size = radix_tree_chunk_size(iter); + long count = radix_tree_chunk_size(iter); + void *canon = slot; - while (--size > 0) { + while (--count > 0) { slot++; - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); + + if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(*slot)) { + if (indirect_to_ptr(*slot) == canon) + continue; + else { + iter->next_index = iter->index; + break; + } + } + if (likely(*slot)) return slot; if (flags & RADIX_TREE_ITER_CONTIG) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index ff460423ff4b..a4da86e40def 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -75,11 +75,6 @@ static inline void *ptr_to_indirect(void *ptr) return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); } -static inline void *indirect_to_ptr(void *ptr) -{ - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); -} - #define RADIX_TREE_RETRY ptr_to_indirect(NULL) #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -885,6 +880,14 @@ int radix_tree_tag_get(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); +static inline void __set_iter_shift(struct radix_tree_iter *iter, + unsigned int shift) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + iter->shift = shift; +#endif +} + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -898,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, { unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *rnode, *node; - unsigned long index, offset, height; + unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -916,33 +919,39 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!index && iter->index) return NULL; - rnode = rcu_dereference_raw(root->rnode); + restart: + shift = radix_tree_load_root(root, &rnode, &maxindex); + if (index > maxindex) + return NULL; + if (radix_tree_is_indirect_ptr(rnode)) { rnode = indirect_to_ptr(rnode); - } else if (rnode && !index) { + } else if (rnode) { /* Single-slot tree */ - iter->index = 0; - iter->next_index = 1; + iter->index = index; + iter->next_index = maxindex + 1; iter->tags = 1; + __set_iter_shift(iter, shift); return (void **)&root->rnode; } else return NULL; -restart: - height = rnode->path & RADIX_TREE_HEIGHT_MASK; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + shift -= RADIX_TREE_MAP_SHIFT; offset = index >> shift; - /* Index outside of the tree */ - if (offset >= RADIX_TREE_MAP_SIZE) - return NULL; - node = rnode; while (1) { struct radix_tree_node *slot; + unsigned new_off = radix_tree_descend(node, &slot, offset); + + if (new_off < offset) { + offset = new_off; + index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); + index |= offset << shift; + } + if ((flags & RADIX_TREE_ITER_TAGGED) ? 
- !test_bit(offset, node->tags[tag]) : - !node->slots[offset]) { + !tag_get(node, tag, offset) : !slot) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; @@ -954,7 +963,10 @@ restart: offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { - if (node->slots[offset]) + void *slot = node->slots[offset]; + if (is_sibling_entry(node, slot)) + continue; + if (slot) break; } index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); @@ -964,25 +976,23 @@ restart: return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; + slot = rcu_dereference_raw(node->slots[offset]); } - /* This is leaf-node */ - if (!shift) - break; - - slot = rcu_dereference_raw(node->slots[offset]); - if (slot == NULL) + if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; if (!radix_tree_is_indirect_ptr(slot)) break; + node = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } /* Update the iterator state */ - iter->index = index; - iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + iter->index = index & ~((1 << shift) - 1); + iter->next_index = (index | ((RADIX_TREE_MAP_SIZE << shift) - 1)) + 1; + __set_iter_shift(iter, shift); /* Construct iter->tags bit-mask from node->tags[tag] array */ if (flags & RADIX_TREE_ITER_TAGGED) { diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h new file mode 100644 index 000000000000..ad18cf5a2a3a --- /dev/null +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -0,0 +1,3 @@ +#define CONFIG_RADIX_TREE_MULTIORDER 1 +#define CONFIG_SHMEM 1 +#define CONFIG_SWAP 1 diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 8ea0ed450810..be98a47b4e1b 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -8,10 +8,7 @@ #include #include "../../include/linux/compiler.h" - -#define CONFIG_RADIX_TREE_MULTIORDER -#define CONFIG_SHMEM -#define CONFIG_SWAP +#include "../../../include/linux/kconfig.h" #define RADIX_TREE_MAP_SHIFT 3 -- cgit v1.2.3 From 0c7fa0a8418cbe0e8963fe36db9575d03b8589f7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:07 -0700 Subject: radix-tree: split node->path into offset and height Neither piece of information we're storing in node->path can be larger than 64, so store each in its own unsigned char instead of shifting and masking to store them both in an unsigned int. 
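In practice the change turns two shift-and-mask reads into plain field accesses. The two helpers below are hypothetical illustrations written against the new layout; the commented-out lines show the equivalent reads against the old packed node->path:

static unsigned int node_height(const struct radix_tree_node *node)
{
	/* old: return node->path & RADIX_TREE_HEIGHT_MASK; */
	return node->height;
}

static unsigned int node_offset_in_parent(const struct radix_tree_node *node)
{
	/* old: return node->path >> RADIX_TREE_HEIGHT_SHIFT; */
	return node->offset;
}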
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 7 ++----- lib/radix-tree.c | 38 +++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 8558d52e1c7b..2d2ad9d685a3 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -84,16 +84,13 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Height component in node->path */ -#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) -#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) - /* Internally used bits of node->count */ #define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned int path; /* Offset in parent & height from the bottom */ + unsigned char height; /* From the bottom */ + unsigned char offset; /* Slot offset in parent */ unsigned int count; union { struct { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75944e42e4a0..dd04b51e5fbb 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -218,15 +218,15 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned offset, +static void dump_node(struct radix_tree_node *node, unsigned shift, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n", - node, offset, + pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->path, node->count, node->parent); + node->height, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -243,7 +243,7 @@ static void dump_node(struct radix_tree_node *node, unsigned offset, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), i, + dump_node(indirect_to_ptr(entry), shift - RADIX_TREE_MAP_SHIFT, first); } } @@ -257,7 +257,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0, + dump_node(indirect_to_ptr(root->rnode), (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); } #endif @@ -421,7 +421,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->path & RADIX_TREE_HEIGHT_MASK); + return radix_tree_maxindex(node->height); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,8 +434,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return (node->path & RADIX_TREE_HEIGHT_MASK) * - RADIX_TREE_MAP_SHIFT; + return node->height * RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -476,9 +475,10 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height+1; - BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); - node->path = newheight; + newheight = root->height + 1; + BUG_ON(newheight > BITS_PER_LONG); + node->height = newheight; + node->offset = 0; node->count = 1; node->parent = NULL; slot = root->rnode; @@ -546,13 +546,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->path = height; + slot->height = height; + slot->offset = offset; slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], ptr_to_indirect(slot)); node->count++; - slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); @@ -1319,11 +1319,10 @@ struct locate_info { static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned long index, struct locate_info *info) { - unsigned int shift, height; + unsigned int shift; unsigned long i; - height = slot->path & RADIX_TREE_HEIGHT_MASK; - shift = height * RADIX_TREE_MAP_SHIFT; + shift = slot->height * RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; @@ -1508,10 +1507,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent = node->parent; if (parent) { - unsigned int offset; - - offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; - parent->slots[offset] = NULL; + parent->slots[node->offset] = NULL; parent->count--; } else { root_tag_clear_all(root); -- cgit v1.2.3 From c12e51b07b3ac4c188fd91a82f96840fdb9cca6f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:10 -0700 Subject: radix-tree: replace node->height with node->shift node->shift represents the shift necessary for looking in the slots array at this level. It is equal to the old (node->height - 1) * RADIX_TREE_MAP_SHIFT. 
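A worked example with RADIX_TREE_MAP_SHIFT == 6: a leaf holds shift 0, its parent shift 6, the grandparent shift 12, and looking up index 0x12345 descends through slot offsets 18, 13 and 5. Choosing the child slot at each level becomes a single shift-and-mask, as in this hypothetical helper:

static unsigned int slot_offset(const struct radix_tree_node *node,
				unsigned long index)
{
	/* e.g. index 0x12345: offset 18 at shift 12, 13 at shift 6, 5 at shift 0 */
	return (index >> node->shift) & RADIX_TREE_MAP_MASK;
}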
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 +- lib/radix-tree.c | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d2ad9d685a3..037458257e12 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -89,7 +89,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned char height; /* From the bottom */ + unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned int count; union { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index dd04b51e5fbb..648da9080418 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -223,10 +223,10 @@ static void dump_node(struct radix_tree_node *node, { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->height, node->count, node->parent); + node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -419,9 +419,14 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) return height_to_maxindex[height]; } +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->height); + return shift_maxindex(node->shift); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,7 +439,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return node->height * RADIX_TREE_MAP_SHIFT; + return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -475,9 +480,9 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height + 1; + newheight = root->height; BUG_ON(newheight > BITS_PER_LONG); - node->height = newheight; + node->shift = newheight * RADIX_TREE_MAP_SHIFT; node->offset = 0; node->count = 1; node->parent = NULL; @@ -490,7 +495,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = newheight; + root->height = ++newheight; } while (height > root->height); out: return height * RADIX_TREE_MAP_SHIFT; @@ -519,7 +524,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, { struct radix_tree_node *node = NULL, *slot; unsigned long maxindex; - unsigned int height, shift, offset; + unsigned int shift, offset; unsigned long max = index | ((1UL << order) - 1); shift = radix_tree_load_root(root, &slot, &maxindex); @@ -537,16 +542,15 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, } } - height = root->height; - offset = 0; /* uninitialised var warning */ while (shift > order) { + shift -= RADIX_TREE_MAP_SHIFT; if (slot == NULL) { /* Have to add a child node. */ slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->height = height; + slot->shift = shift; slot->offset = offset; slot->parent = node; if (node) { @@ -560,8 +564,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - height--; - shift -= RADIX_TREE_MAP_SHIFT; node = indirect_to_ptr(slot); offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); @@ -1322,7 +1324,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned int shift; unsigned long i; - shift = slot->height * RADIX_TREE_MAP_SHIFT; + shift = slot->shift + RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; -- cgit v1.2.3 From d0891265bbc988dc91ed8580b38eb3dac128581b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:19 -0700 Subject: radix-tree: remove root->height The only remaining references to root->height were in extend and shrink, where it was updated. Now we can remove it entirely. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 -- lib/radix-tree.c | 106 +++++++++++++-------------------------------- 2 files changed, 31 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 037458257e12..c0d223cfac00 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -110,13 +110,11 @@ struct radix_tree_node { /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { - unsigned int height; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ - .height = 0, \ .gfp_mask = (mask), \ .rnode = NULL, \ } @@ -126,7 +124,6 @@ struct radix_tree_root { #define INIT_RADIX_TREE(root, mask) \ do { \ - (root)->height = 0; \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75c9e6197b5b..58f79fee8c71 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,12 +38,6 @@ #include /* in_interrupt() */ -/* - * The height_to_maxindex array needs to be one deeper than the maximum - * path as height 0 holds only 1 entry. 
- */ -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; - /* * Radix tree node cache. */ @@ -218,8 +212,7 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, - unsigned shift, unsigned long index) +static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; @@ -229,8 +222,8 @@ static void dump_node(struct radix_tree_node *node, node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << shift); - unsigned long last = first | ((1UL << shift) - 1); + unsigned long first = index | (i << node->shift); + unsigned long last = first | ((1UL << node->shift) - 1); void *entry = node->slots[i]; if (!entry) continue; @@ -243,8 +236,7 @@ static void dump_node(struct radix_tree_node *node, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), - shift - RADIX_TREE_MAP_SHIFT, first); + dump_node(indirect_to_ptr(entry), first); } } } @@ -252,13 +244,12 @@ static void dump_node(struct radix_tree_node *node, /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p height %d rnode %p tags %x\n", - root, root->height, root->rnode, + pr_debug("radix root: %p rnode %p tags %x\n", + root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), - (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); + dump_node(indirect_to_ptr(root->rnode), 0); } #endif @@ -411,14 +402,8 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. + * The maximum index which can be stored in a radix tree */ -static inline unsigned long radix_tree_maxindex(unsigned int height) -{ - return height_to_maxindex[height]; -} - static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; @@ -450,24 +435,22 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, - unsigned long index) + unsigned long index, unsigned int shift) { struct radix_tree_node *slot; - unsigned int height; + unsigned int maxshift; int tag; - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; + /* Figure out what the shift should be. */ + maxshift = shift; + while (index > shift_maxindex(maxshift)) + maxshift += RADIX_TREE_MAP_SHIFT; - if (root->rnode == NULL) { - root->height = height; + slot = root->rnode; + if (!slot) goto out; - } do { - unsigned int newheight; struct radix_tree_node *node = radix_tree_node_alloc(root); if (!node) @@ -479,14 +462,11 @@ static int radix_tree_extend(struct radix_tree_root *root, tag_set(node, tag, 0); } - /* Increase the height. 
*/ - newheight = root->height; - BUG_ON(newheight > BITS_PER_LONG); - node->shift = newheight * RADIX_TREE_MAP_SHIFT; + BUG_ON(shift > BITS_PER_LONG); + node->shift = shift; node->offset = 0; node->count = 1; node->parent = NULL; - slot = root->rnode; if (radix_tree_is_indirect_ptr(slot)) { slot = indirect_to_ptr(slot); slot->parent = node; @@ -495,10 +475,11 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = ++newheight; - } while (height > root->height); + shift += RADIX_TREE_MAP_SHIFT; + slot = node; + } while (shift <= maxshift); out: - return height * RADIX_TREE_MAP_SHIFT; + return maxshift + RADIX_TREE_MAP_SHIFT; } /** @@ -531,15 +512,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, /* Make sure the tree is high enough. */ if (max > maxindex) { - int error = radix_tree_extend(root, max); + int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; slot = root->rnode; - if (order == shift) { + if (order == shift) shift += RADIX_TREE_MAP_SHIFT; - root->height++; - } } offset = 0; /* uninitialised var warning */ @@ -1412,32 +1391,32 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink height of a radix tree to minimal + * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; - /* try to shrink tree height */ - while (root->height > 0) { + for (;;) { struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + if (!radix_tree_is_indirect_ptr(to_free)) + break; to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or it is a multiorder entry, - * we cannot shrink. + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. */ if (to_free->count != 1) break; slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && (root->height > 1)) + if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; if (radix_tree_is_indirect_ptr(slot)) { @@ -1454,7 +1433,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * one (root->rnode) as far as dependent read barriers go. */ root->rnode = slot; - root->height--; /* * We have a dilemma here. 
The node's slot[0] must not be @@ -1515,7 +1493,6 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent->count--; } else { root_tag_clear_all(root); - root->height = 0; root->rnode = NULL; } @@ -1631,26 +1608,6 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxindex(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); -} - static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1677,6 +1634,5 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); } -- cgit v1.2.3 From 30ff46ccb303fb6f6c28b9aa9f2cdc4ba900ed3f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:22 -0700 Subject: radix-tree: rename INDIRECT_PTR to INTERNAL_NODE The name RADIX_TREE_INDIRECT_PTR doesn't really match the meaning. RADIX_TREE_INTERNAL_NODE is a better name. Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 30 +++++++++++++----------------- lib/radix-tree.c | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c0d223cfac00..c8cc879046c7 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,20 +29,16 @@ #include /* - * An indirect pointer (root->rnode pointing to a radix_tree_node, rather - * than a data item) is signalled by the low bit set in the root->rnode - * pointer. - * - * In this case root->height is > 0, but the indirect pointer tests are - * needed for RCU lookups (because root->height is unreliable). The only - * time callers need worry about this is when doing a lookup_slot under - * RCU. - * - * Indirect pointer in fact is also used to tag the last pointer of a node - * when it is shrunk, before we rcu free the node. See shrink code for - * details. + * Entries in the radix tree have the low bit set if they refer to a + * radix_tree_node. If the low bit is clear then the entry is user data. + * + * We also use the low bit to indicate that the slot will be freed in the + * next RCU idle period, and users need to re-walk the tree to find the + * new slot for the index that they were looking for. See the comment in + * radix_tree_shrink() for details. 
*/ -#define RADIX_TREE_INDIRECT_PTR 1 +#define RADIX_TREE_INTERNAL_NODE 1 + /* * A common use of the radix tree is to store pointers to struct pages; * but shmem/tmpfs needs also to store swap entries in the same tree: @@ -63,7 +59,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); + return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } /*** radix-tree API starts here ***/ @@ -228,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR); + return unlikely(radix_tree_is_indirect_ptr(arg)); } /** @@ -250,7 +246,7 @@ static inline int radix_tree_exceptional_entry(void *arg) static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & - (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY)); + (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); } /** @@ -448,7 +444,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) static inline void *indirect_to_ptr(void *ptr) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 58f79fee8c71..31d5929a625b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -68,7 +68,7 @@ static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static inline void *ptr_to_indirect(void *ptr) { - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY ptr_to_indirect(NULL) -- cgit v1.2.3 From 4dd6c0987ca43d6544f4f0a3f86f6ea3bfc60fc1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:27 -0700 Subject: radix-tree: rename indirect_to_ptr() to entry_to_node() Mirrors the earlier commit introducing node_to_entry(). Also change the type returned to be a struct radix_tree_node pointer. That lets us simplify a couple of places in the radix tree shrink & extend paths where we could convert an entry into a pointer, modify the node, then convert the pointer back into an entry. 
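As a rough illustration of the conversion pair the message above refers to, here is a sketch of the two helpers as they look after this rename (illustrative only, not a copy of the kernel header; RADIX_TREE_INTERNAL_NODE is the tag bit renamed in the previous commit):

struct radix_tree_node;

#define RADIX_TREE_INTERNAL_NODE	1UL

/* Tag a node pointer so it can be stored in a slot as an internal entry. */
static inline void *node_to_entry(struct radix_tree_node *node)
{
	return (void *)((unsigned long)node | RADIX_TREE_INTERNAL_NODE);
}

/* Strip the tag and return a usable node pointer in one step. */
static inline struct radix_tree_node *entry_to_node(void *entry)
{
	return (struct radix_tree_node *)
		((unsigned long)entry & ~RADIX_TREE_INTERNAL_NODE);
}

Because entry_to_node() already returns the node type, callers in the shrink and extend paths can write entry_to_node(slot)->parent = node; instead of converting the entry to a pointer, modifying it, and converting it back, as the hunks below show.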
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 12 +++++------ lib/radix-tree.c | 48 ++++++++++++++++++----------------------- tools/testing/radix-tree/test.c | 4 ++-- tools/testing/radix-tree/test.h | 1 - 4 files changed, 28 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c8cc879046c7..b94aa198dd6b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -442,7 +442,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline void *indirect_to_ptr(void *ptr) +static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } @@ -469,7 +469,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(slot[1])) { - if (indirect_to_ptr(slot[1]) == canon) { + if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); slot++; @@ -499,12 +499,10 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(*slot)) { - if (indirect_to_ptr(*slot) == canon) + if (entry_to_node(*slot) == canon) continue; - else { - iter->next_index = iter->index; - break; - } + iter->next_index = iter->index; + break; } if (likely(*slot)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f66bb3932452..3c3fdd9c5bb3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -230,13 +230,13 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) if (is_sibling_entry(node, entry)) { pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", entry, i, - *(void **)indirect_to_ptr(entry), + *(void **)entry_to_node(entry), first, last); } else if (!radix_tree_is_indirect_ptr(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), first); + dump_node(entry_to_node(entry), first); } } } @@ -249,7 +249,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0); + dump_node(entry_to_node(root->rnode), 0); } #endif @@ -422,7 +422,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; if (likely(radix_tree_is_indirect_ptr(node))) { - node = indirect_to_ptr(node); + node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } @@ -467,11 +467,8 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = node; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -542,7 +539,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - node = indirect_to_ptr(slot); + node = entry_to_node(slot); offset = (index >> shift) & 
RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); } @@ -645,7 +642,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (node == RADIX_TREE_RETRY) goto restart; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(parent, &node, offset); @@ -729,7 +726,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); BUG_ON(!node); @@ -777,7 +774,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); } @@ -844,7 +841,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); if (!node) @@ -904,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (radix_tree_is_indirect_ptr(rnode)) { - rnode = indirect_to_ptr(rnode); + rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ iter->index = index; @@ -963,7 +960,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(slot)) break; - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } @@ -1048,7 +1045,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, return 1; } - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; for (;;) { @@ -1063,7 +1060,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, goto next; /* Sibling slots never have tags set on them */ if (radix_tree_is_indirect_ptr(slot)) { - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; } @@ -1322,7 +1319,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, } continue; } - node = indirect_to_ptr(node); + node = entry_to_node(node); if (is_sibling_entry(slot, node)) continue; slot = node; @@ -1367,7 +1364,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) break; } - node = indirect_to_ptr(node); + node = entry_to_node(node); max_index = node_maxindex(node); if (cur_index > max_index) { @@ -1403,7 +1400,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(to_free)) break; - to_free = indirect_to_ptr(to_free); + to_free = entry_to_node(to_free); /* * The candidate node has more than one child, or its child @@ -1418,11 +1415,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = NULL; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply @@ -1481,7 +1475,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct 
radix_tree_node *parent; if (node->count) { - if (node == indirect_to_ptr(root->rnode)) + if (node == entry_to_node(root->rnode)) deleted |= radix_tree_shrink(root); return deleted; } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 3004c58b9021..7b0bc1fa5919 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -149,7 +149,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, int i; int j; - slot = indirect_to_ptr(slot); + slot = entry_to_node(slot); /* Verify consistency at this level */ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) { @@ -227,7 +227,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) return; } - node = indirect_to_ptr(node); + node = entry_to_node(node); assert(maxindex <= node_maxindex(node)); shift = node->shift; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 866c8c676aa4..e85131369723 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -39,7 +39,6 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ -void *indirect_to_ptr(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); -- cgit v1.2.3 From b194d16c27af905d6e3552f4851bc7d9fee4e90f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:30 -0700 Subject: radix-tree: rename radix_tree_is_indirect_ptr() As with indirect_to_ptr(), ptr_to_indirect() and RADIX_TREE_INDIRECT_PTR, change radix_tree_is_indirect_ptr() to radix_tree_is_internal_node(). Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 10 ++++----- lib/radix-tree.c | 48 ++++++++++++++++++++--------------------- tools/testing/radix-tree/test.c | 4 ++-- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b94aa198dd6b..bad63105e37e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -57,7 +57,7 @@ #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) -static inline int radix_tree_is_indirect_ptr(void *ptr) +static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } @@ -224,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely(radix_tree_is_indirect_ptr(arg)); + return unlikely(radix_tree_is_internal_node(arg)); } /** @@ -259,7 +259,7 @@ static inline int radix_tree_exception(void *arg) */ static inline void radix_tree_replace_slot(void **pslot, void *item) { - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); rcu_assign_pointer(*pslot, item); } @@ -468,7 +468,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (unlikely(!iter->tags)) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(slot[1])) { + radix_tree_is_internal_node(slot[1])) { if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); @@ -498,7 +498,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) iter->index = __radix_tree_iter_add(iter, 1); if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(*slot)) { + radix_tree_is_internal_node(*slot)) { if (entry_to_node(*slot) == canon) continue; iter->next_index = iter->index; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3c3fdd9c5bb3..b65c83036ca4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -100,7 +100,7 @@ static unsigned radix_tree_descend(struct radix_tree_node *parent, void **entry = rcu_dereference_raw(parent->slots[offset]); #ifdef CONFIG_RADIX_TREE_MULTIORDER - if (radix_tree_is_indirect_ptr(entry)) { + if (radix_tree_is_internal_node(entry)) { unsigned long siboff = get_slot_offset(parent, entry); if (siboff < RADIX_TREE_MAP_SIZE) { offset = siboff; @@ -232,7 +232,7 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) entry, i, *(void **)entry_to_node(entry), first, last); - } else if (!radix_tree_is_indirect_ptr(entry)) { + } else if (!radix_tree_is_internal_node(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { @@ -247,7 +247,7 @@ static void radix_tree_dump(struct radix_tree_root *root) pr_debug("radix root: %p rnode %p tags %x\n", root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); - if (!radix_tree_is_indirect_ptr(root->rnode)) + if (!radix_tree_is_internal_node(root->rnode)) return; dump_node(entry_to_node(root->rnode), 0); } @@ -302,7 +302,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_ACCOUNT); out: - BUG_ON(radix_tree_is_indirect_ptr(ret)); + BUG_ON(radix_tree_is_internal_node(ret)); return ret; } @@ -421,7 +421,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; - if (likely(radix_tree_is_indirect_ptr(node))) { + if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; @@ -467,7 +467,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); @@ -535,7 +535,7 @@ int 
__radix_tree_create(struct radix_tree_root *root, unsigned long index, } else rcu_assign_pointer(root->rnode, node_to_entry(slot)); - } else if (!radix_tree_is_indirect_ptr(slot)) + } else if (!radix_tree_is_internal_node(slot)) break; /* Go a level down */ @@ -585,7 +585,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, void **slot; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, order, &node, &slot); if (error) @@ -637,7 +637,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (index > maxindex) return NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; if (node == RADIX_TREE_RETRY) @@ -720,7 +720,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift = radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -770,7 +770,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, parent = NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -835,7 +835,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (node == NULL) return 0; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { int offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -900,7 +900,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (index > maxindex) return NULL; - if (radix_tree_is_indirect_ptr(rnode)) { + if (radix_tree_is_internal_node(rnode)) { rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ @@ -957,7 +957,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) break; node = entry_to_node(slot); @@ -1039,7 +1039,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, *first_indexp = last_index + 1; return 0; } - if (!radix_tree_is_indirect_ptr(slot)) { + if (!radix_tree_is_internal_node(slot)) { *first_indexp = last_index + 1; root_tag_set(root, settag); return 1; @@ -1059,7 +1059,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, if (!tag_get(node, iftag, offset)) goto next; /* Sibling slots never have tags set on them */ - if (radix_tree_is_indirect_ptr(slot)) { + if (radix_tree_is_internal_node(slot)) { node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; @@ -1152,7 +1152,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1235,7 +1235,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1311,7 +1311,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, rcu_dereference_raw(slot->slots[i]); if (node == 
RADIX_TREE_RETRY) goto out; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { if (node == item) { info->found_index = index; info->stop = true; @@ -1357,7 +1357,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) do { rcu_read_lock(); node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { rcu_read_unlock(); if (node == item) info.found_index = 0; @@ -1398,7 +1398,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - if (!radix_tree_is_indirect_ptr(to_free)) + if (!radix_tree_is_internal_node(to_free)) break; to_free = entry_to_node(to_free); @@ -1412,10 +1412,10 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) + if (!radix_tree_is_internal_node(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = NULL; /* @@ -1445,7 +1445,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) to_free->slots[0] = RADIX_TREE_RETRY; radix_tree_node_free(to_free); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 7b0bc1fa5919..a6e8099eaf4f 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -193,7 +193,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) { struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) + if (!radix_tree_is_internal_node(node)) return; verify_node(node, tag, !!root_tag_get(root, tag)); } @@ -222,7 +222,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) { unsigned shift; struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { assert(maxindex == 0); return; } -- cgit v1.2.3 From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:45 -0700 Subject: radix-tree: introduce radix_tree_replace_clear_tags() In addition to replacing the entry, we also clear all associated tags. This is really a one-off special for page_cache_tree_delete() which had far too much detailed knowledge about how the radix tree works. For efficiency, factor node_tag_clear() out of radix_tree_tag_clear() It can be used by radix_tree_delete_item() as well as radix_tree_replace_clear_tags(). 
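As a hedged sketch of how a caller is expected to use the new interface (remove_entry(), tree and shadow are placeholder names; the real caller this commit converts is page_cache_tree_delete() in mm/filemap.c, shown in the hunks below):

static void remove_entry(struct radix_tree_root *tree, unsigned long index,
			 void *shadow)
{
	struct radix_tree_node *node;

	/* Replace the slot with the shadow entry and clear every tag on it. */
	node = radix_tree_replace_clear_tags(tree, index, shadow);
	if (!node)
		return;		/* the entry lived directly in the root */

	/* per-node accounting would go here, as page_cache_tree_delete() does */
}

The design point is that the caller no longer needs to know about slots, offsets or tag bitmaps; it gets the node back only for its own bookkeeping.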
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 9 ++++-- lib/radix-tree.c | 76 ++++++++++++++++++++++++++++------------------ mm/filemap.c | 23 ++------------ 3 files changed, 56 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index bad63105e37e..11c8e7cc3920 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, + void **results, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9d9b4b9af4b6..c7114d233b38 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, offset = radix_tree_descend(parent, &node, offset); } - if (node == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - index >>= shift; - - while (parent) { - if (!tag_get(parent, tag, offset)) - goto out; - tag_clear(parent, tag, offset); - if (any_tag_set(parent, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - parent = parent->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root, offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. 
*/ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; @@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(root, index, &node, &slot); + + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } + + radix_tree_replace_slot(slot, entry); + return node; +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index b418405903bc..9665b1d4f318 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - unsigned long index; - unsigned int offset; - unsigned int tag; - void **slot; VM_BUG_ON(!PageLocked(page)); - __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, + shadow); if (shadow) { mapping->nrexceptional++; @@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping, } mapping->nrpages--; - if (!node) { - /* Clear direct pointer tags in root node */ - mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; - radix_tree_replace_slot(slot, shadow); + if (!node) return; - } - - /* Clear tree tags for the removed page */ - index = page->index; - offset = index & RADIX_TREE_MAP_MASK; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (test_bit(offset, node->tags[tag])) - radix_tree_tag_clear(&mapping->page_tree, index, tag); - } - /* Delete page, swap shadow entry */ - radix_tree_replace_slot(slot, shadow); workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); -- cgit v1.2.3 From 78a9be0a0a3367b94af242632c525d22b26f1a87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 May 2016 17:03:51 -0700 Subject: dax: move RADIX_DAX_ definitions to dax.c These don't belong in radix-tree.h any more than PAGECACHE_TAG_* do. Let's try to maintain the idea that radix-tree simply implements an abstract data type. Signed-off-by: NeilBrown Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 9 +++++++++ include/linux/radix-tree.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 0dbe4e0f16fe..a345c168acaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,6 +32,15 @@ #include #include +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 11c8e7cc3920..c2f69e25ba86 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -48,15 +48,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) - static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); -- cgit v1.2.3 From 3bcadd6fa6c4fd07ace3626357c824eb532488a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:54 -0700 Subject: radix-tree: free up the bottom bit of exceptional entries for reuse We are guaranteed that pointers to radix_tree_nodes always have the bottom two bits clear (because they come from a slab cache, and slab caches have a minimum alignment of sizeof(void *)), so we can redefine 'radix_tree_is_internal_node' to only return true if the bottom two bits have value '01'. This frees up one quarter of the potential values for use by the user. Idea from Neil Brown. Signed-off-by: Matthew Wilcox Suggested-by: Neil Brown Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c2f69e25ba86..cb4b7e8cee81 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,28 +29,37 @@ #include /* - * Entries in the radix tree have the low bit set if they refer to a - * radix_tree_node. If the low bit is clear then the entry is user data. - * - * We also use the low bit to indicate that the slot will be freed in the - * next RCU idle period, and users need to re-walk the tree to find the - * new slot for the index that they were looking for. See the comment in - * radix_tree_shrink() for details. + * The bottom two bits of the slot determine how the remaining bits in the + * slot are interpreted: + * + * 00 - data pointer + * 01 - internal entry + * 10 - exceptional entry + * 11 - locked exceptional entry + * + * The internal entry may be a pointer to the next level in the tree, a + * sibling entry, or an indicator that the entry in this slot has been moved + * to another location in the tree and the lookup should be restarted. While + * NULL fits the 'data pointer' pattern, it means that there is no entry in + * the tree for this index (no matter what level of the tree it is found at). + * This means that you cannot store NULL in the tree as a value for the index. 
*/ -#define RADIX_TREE_INTERNAL_NODE 1 +#define RADIX_TREE_ENTRY_MASK 3UL +#define RADIX_TREE_INTERNAL_NODE 1UL /* - * A common use of the radix tree is to store pointers to struct pages; - * but shmem/tmpfs needs also to store swap entries in the same tree: - * those are marked as exceptional entries to distinguish them. + * Most users of the radix tree store pointers but shmem/tmpfs stores swap + * entries in the same tree. They are marked as exceptional entries to + * distinguish them from pointers to struct page. * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. */ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -static inline int radix_tree_is_internal_node(void *ptr) +static inline bool radix_tree_is_internal_node(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); + return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == + RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ @@ -236,8 +245,7 @@ static inline int radix_tree_exceptional_entry(void *arg) */ static inline int radix_tree_exception(void *arg) { - return unlikely((unsigned long)arg & - (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); + return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } /** -- cgit v1.2.3
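To make the two-bit scheme above concrete, here is a small stand-alone sketch (user-space C, illustrative only; the macro names mirror the kernel's but the program is not kernel code). It shows why an entry whose bottom bits are 11 no longer tests as an internal node, which is exactly the quarter of the value space this commit frees up:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define RADIX_TREE_ENTRY_MASK		3UL
#define RADIX_TREE_INTERNAL_NODE	1UL	/* bits 01 */
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2UL	/* bits 10 */

static bool is_internal_node(void *entry)
{
	return ((unsigned long)entry & RADIX_TREE_ENTRY_MASK) ==
			RADIX_TREE_INTERNAL_NODE;
}

int main(void)
{
	/*
	 * Slab objects are at least pointer-aligned, so a real
	 * radix_tree_node address always has the bottom two bits clear.
	 */
	unsigned long node = 0x1000;	/* stand-in for a node address */

	void *data = (void *)node;					/* 00 */
	void *internal = (void *)(node | RADIX_TREE_INTERNAL_NODE);	/* 01 */
	void *exceptional = (void *)(0x40UL | RADIX_TREE_EXCEPTIONAL_ENTRY); /* 10 */
	void *locked_exc = (void *)(0x40UL | 3UL);	/* 11, now free for users */

	assert(!is_internal_node(data));
	assert(is_internal_node(internal));
	assert(!is_internal_node(exceptional));
	/* The old one-bit test ((unsigned long)p & 1) would wrongly say true here: */
	assert(!is_internal_node(locked_exc));

	printf("two-bit entry tagging behaves as described\n");
	return 0;
}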