From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 20 May 2016 16:56:31 -0700 Subject: mm: vmscan: reduce size of inactive file list The inactive file list should still be large enough to contain readahead windows and freshly written file data, but it no longer is the only source for detecting multiple accesses to file pages. The workingset refault measurement code causes recently evicted file pages that get accessed again after a shorter interval to be promoted directly to the active list. With that mechanism in place, we can afford to (on a larger system) dedicate more memory to the active file list, so we can actually cache more of the frequently used file pages in memory, and not have them pushed out by streaming writes, once-used streaming file reads, etc. This can help things like database workloads, where only half the page cache can currently be used to cache the database working set. This patch automatically increases that fraction on larger systems, using the same ratio that has already been used for anonymous memory. [hannes@cmpxchg.org: cgroup-awareness] Signed-off-by: Rik van Riel Signed-off-by: Johannes Weiner Reported-by: Andres Freund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 94da96738df3..a805474df4ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - void mem_cgroup_handle_over_high(void); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, @@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - return true; -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { -- cgit v1.2.3 From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:38 -0700 Subject: mm, compaction: change COMPACT_ constants into enum Compaction code is doing weird dances between COMPACT_FOO -> int -> unsigned long But there doesn't seem to be any reason for that. All functions which return/use one of those constants are not expecting any other value so it really makes sense to define an enum for them and make it clear that no other values are expected. This is a pure cleanup and shouldn't introduce any functional changes. 
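To see why the enum matters in practice: once the return type is enum compact_result, implicit conversions to bare integers become visible and a switch over the result gets -Wswitch coverage warnings from the compiler. Below is a minimal userspace sketch of that idea, not kernel code; the enumerators mirror the first five states from the patch, while handle_feedback() and its strings are purely illustrative.

#include <stdio.h>

/* Mirrors the first five states introduced by this patch; comments abbreviated. */
enum compact_result {
	COMPACT_DEFERRED,	/* deferred due to past failures */
	COMPACT_SKIPPED,	/* not possible or reclaim was more suitable */
	COMPACT_CONTINUE,	/* continue to another pageblock */
	COMPACT_PARTIAL,	/* partially compacted, suitable pages exist */
	COMPACT_COMPLETE,	/* the full zone was compacted */
};

/* With an enum return type, -Wswitch warns if a state is left unhandled here. */
static const char *handle_feedback(enum compact_result result)
{
	switch (result) {
	case COMPACT_DEFERRED:	return "back off, compaction recently failed";
	case COMPACT_SKIPPED:	return "let direct reclaim run first";
	case COMPACT_CONTINUE:	return "keep compacting";
	case COMPACT_PARTIAL:	return "retry the allocation";
	case COMPACT_COMPLETE:	return "whole zone scanned";
	}
	return "unexpected value";
}

int main(void)
{
	printf("%s\n", handle_feedback(COMPACT_PARTIAL));
	return 0;
}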
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------ mm/compaction.c | 27 ++++++++++++++------------- mm/page_alloc.c | 2 +- 3 files changed, 42 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 242b660f64e6..706cbf00e919 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,21 +2,29 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ -/* compaction didn't start as it was deferred due to past failures */ -#define COMPACT_DEFERRED 0 -/* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 1 -/* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 2 -/* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 3 -/* The full zone was compacted */ -#define COMPACT_COMPLETE 4 -/* For more detailed tracepoint output */ -#define COMPACT_NO_SUITABLE_PAGE 5 -#define COMPACT_NOT_SUITABLE_ZONE 6 -#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ +enum compact_result { + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* + * compaction didn't start as it was not possible or direct reclaim + * was more suitable + */ + COMPACT_SKIPPED, + /* compaction should continue to another pageblock */ + COMPACT_CONTINUE, + /* + * direct compaction partially compacted a zone and there are suitable + * pages + */ + COMPACT_PARTIAL, + /* The full zone was compacted */ + COMPACT_COMPLETE, + /* For more detailed tracepoint output */ + COMPACT_NO_SUITABLE_PAGE, + COMPACT_NOT_SUITABLE_ZONE, + COMPACT_CONTENDED, +}; /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ @@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern unsigned long compaction_suitable(struct zone *zone, int order, +extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); @@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, +static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) @@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline unsigned long compaction_suitable(struct zone *zone, int 
order, +static inline enum compact_result compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { return COMPACT_SKIPPED; diff --git a/mm/compaction.c b/mm/compaction.c index eda3c2244f30..e721d252c5d2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, +static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { @@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, +enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); trace_mm_compaction_suitable(zone, order, ret); @@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { - int ret; + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1530,11 +1531,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) { @@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_DEFERRED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; if (compaction_deferred(zone, order)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index edbdf56b3c9b..ed62c4b90598 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - unsigned long compact_result; + enum compact_result compact_result; struct page *page; if (!order) -- cgit v1.2.3 From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:44 -0700 Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED try_to_compact_pages() can currently return COMPACT_SKIPPED even when the compaction is deferred for some zone, just because zone DMA is skipped in 99% of cases due to watermark checks. This makes COMPACT_DEFERRED basically unusable for the page allocator as a feedback mechanism. Make sure we distinguish those two states properly and switch their ordering in the enum. This means that COMPACT_SKIPPED will be returned only when all eligible zones are skipped. As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath will be more precise and we would bail out rather than reclaim. 
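The ordering switch is what makes the accumulation across zones work: because COMPACT_SKIPPED is now the smallest value, keeping a running maximum yields COMPACT_SKIPPED only if every eligible zone was skipped, and COMPACT_DEFERRED only if no zone reported anything more useful. A minimal userspace sketch of that accumulation follows; combine() and the sample values are illustrative stand-ins, not kernel code.

#include <stdio.h>

/* The ordering after this patch: SKIPPED sorts below DEFERRED. */
enum compact_result {
	COMPACT_SKIPPED,
	COMPACT_DEFERRED,
	COMPACT_INACTIVE = COMPACT_DEFERRED,
	COMPACT_CONTINUE,
	COMPACT_PARTIAL,
	COMPACT_COMPLETE,
};

#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

/*
 * Accumulate per-zone feedback the way try_to_compact_pages() does after
 * this patch: the combined result is SKIPPED only when every zone was
 * skipped, and DEFERRED only when no zone reported anything more useful.
 */
static enum compact_result combine(const enum compact_result *status, int nr)
{
	enum compact_result rc = COMPACT_SKIPPED;
	int i;

	for (i = 0; i < nr; i++)
		rc = max_t(enum compact_result, status[i], rc);
	return rc;
}

int main(void)
{
	enum compact_result zones[] = {
		COMPACT_SKIPPED, COMPACT_DEFERRED, COMPACT_SKIPPED
	};

	/* prints 1, i.e. COMPACT_DEFERRED */
	printf("combined result = %d\n", combine(zones, 3));
	return 0;
}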
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 7 +++++-- include/trace/events/compaction.h | 2 +- mm/compaction.c | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 706cbf00e919..11f228712ed5 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,13 +4,16 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { - /* compaction didn't start as it was deferred due to past failures */ - COMPACT_DEFERRED, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* compaction not active last round */ + COMPACT_INACTIVE = COMPACT_DEFERRED, + /* compaction should continue to another pageblock */ COMPACT_CONTINUE, /* diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e215bf68f521..6ba16c86d7db 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -10,8 +10,8 @@ #include #define COMPACTION_STATUS \ - EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ diff --git a/mm/compaction.c b/mm/compaction.c index 455ecd87f48d..b2b94474dd28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - enum compact_result rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, @@ -1667,7 +1669,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; -- cgit v1.2.3 From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:47 -0700 Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE COMPACT_COMPLETE now means that compaction and free scanner met. This is not very useful information if somebody just wants to use this feedback and make any decisions based on that. The current caller might be a poor guy who just happened to scan tiny portion of the zone and that could be the reason no suitable pages were compacted. Make sure we distinguish the full and partial zone walks. 
Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success and be optimistic in retrying. The existing users of COMPACT_COMPLETE are conservatively changed to use COMPACT_PARTIAL_SKIPPED as well but some of them should be probably reconsidered and only defer the compaction only for COMPACT_COMPLETE with the new semantic. This patch shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 +++++++++- include/trace/events/compaction.h | 1 + mm/compaction.c | 14 +++++++++++--- mm/internal.h | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 11f228712ed5..9b37f9d3f7a8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,7 +21,15 @@ enum compact_result { * pages */ COMPACT_PARTIAL, - /* The full zone was compacted */ + /* + * direct compaction has scanned part of the zone but wasn't successfull + * to compact suitable pages. + */ + COMPACT_PARTIAL_SKIPPED, + /* + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. + */ COMPACT_COMPLETE, /* For more detailed tracepoint output */ COMPACT_NO_SUITABLE_PAGE, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6ba16c86d7db..36e2d6fb1360 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -14,6 +14,7 @@ EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ diff --git a/mm/compaction.c b/mm/compaction.c index b2b94474dd28..4af1577adb5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx, 0)) { success = true; compaction_defer_reset(zone, cc.order, false); - } else if (status == COMPACT_COMPLETE) { + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* * We use sync migration mode here, so we defer like * sync direct compaction does. 
diff --git a/mm/internal.h b/mm/internal.h index 3ac544f1963f..f6f3353b0868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -174,6 +174,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool whole_zone; /* Whole zone has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ -- cgit v1.2.3 From 4f9a358c36fcdad3ea1db263ec4d484a70ad543e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:50 -0700 Subject: mm, compaction: update compaction_result ordering compaction_result will be used as the primary feedback channel for compaction users. At the same time try_to_compact_pages (and potentially others) assume a certain ordering where more specific feedback takes precedence. This gets a bit awkward when we have conflicting feedback from different zones, e.g. one returning COMPACT_COMPLETE, meaning the full zone has been scanned without any outcome, while another returns COMPACT_PARTIAL, i.e. it made some progress. The caller should get COMPACT_PARTIAL because that means compaction can still make some progress. The same applies for COMPACT_PARTIAL vs COMPACT_PARTIAL_SKIPPED. Reorder PARTIAL to be the largest value, so that the larger the value, the more progress has been made. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9b37f9d3f7a8..ff39fa0a1ede 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,6 +4,8 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NOT_SUITABLE_ZONE, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, /* compaction didn't start as it was deferred due to past failures */ COMPACT_DEFERRED, + /* compaction not active last round */ COMPACT_INACTIVE = COMPACT_DEFERRED, + /* For more detailed tracepoint output - internal to compaction */ + COMPACT_NO_SUITABLE_PAGE, /* compaction should continue to another pageblock */ COMPACT_CONTINUE, + /* - * direct compaction partially compacted a zone and there are suitable - * pages + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. */ - COMPACT_PARTIAL, + COMPACT_COMPLETE, /* * direct compaction has scanned part of the zone but wasn't successfull * to compact suitable pages. */ COMPACT_PARTIAL_SKIPPED, + + /* compaction terminated prematurely due to lock contentions */ + COMPACT_CONTENDED, + /* - * The full zone was compacted scanned but wasn't successfull to compact - * suitable pages. 
+ * direct compaction partially compacted a zone and there might be + * suitable pages */ - COMPACT_COMPLETE, - /* For more detailed tracepoint output */ - COMPACT_NO_SUITABLE_PAGE, - COMPACT_NOT_SUITABLE_ZONE, - COMPACT_CONTENDED, + COMPACT_PARTIAL, }; /* Used to signal whether compaction detected need_sched() or lock contention */ -- cgit v1.2.3 From cab1802b5f0dddea30547a7451fda8c7e4c593f0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:56 -0700 Subject: mm, compaction: abstract compaction feedback to helpers Compaction can provide a wild variation of feedback to the caller. Many of them are implementation specific and the caller of the compaction (especially the page allocator) shouldn't be bound to specifics of the current implementation. This patch abstracts the feedback into three basic types: - compaction_made_progress - compaction was active and made some progress. - compaction_failed - compaction failed and further attempts to invoke it would most probably fail and therefore it is not worth retrying - compaction_withdrawn - compaction wasn't invoked for an implementation specific reasons. In the current implementation it means that the compaction was deferred, contended or the page scanners met too early without any progress. Retrying is still worthwhile. [vbabka@suse.cz: do not change thp back off behavior] [akpm@linux-foundation.org: fix typo in comment, per Hillf] Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ff39fa0a1ede..8d8c916fe67a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -78,6 +78,70 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +/* Compaction has made some progress and retrying makes sense */ +static inline bool compaction_made_progress(enum compact_result result) +{ + /* + * Even though this might sound confusing this in fact tells us + * that the compaction successfully isolated and migrated some + * pageblocks. + */ + if (result == COMPACT_PARTIAL) + return true; + + return false; +} + +/* Compaction has failed and it doesn't make much sense to keep retrying. */ +static inline bool compaction_failed(enum compact_result result) +{ + /* All zones were scanned completely and still not result. */ + if (result == COMPACT_COMPLETE) + return true; + + return false; +} + +/* + * Compaction has backed off for some reason. It might be throttling or + * lock contention. Retrying is still worthwhile. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ + /* + * Compaction backed off due to watermark checks for order-0 + * so the regular reclaim has to try harder and reclaim something. + */ + if (result == COMPACT_SKIPPED) + return true; + + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. 
+ */ + if (result == COMPACT_DEFERRED) + return true; + + /* + * If compaction in async mode encounters contention or blocks higher + * priority task we back off early rather than cause stalls. + */ + if (result == COMPACT_CONTENDED) + return true; + + /* + * Page scanners have met but we haven't scanned full zones so this + * is a back off in fact. + */ + if (result == COMPACT_PARTIAL_SKIPPED) + return true; + + return false; +} + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); @@ -114,6 +178,21 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline bool compaction_made_progress(enum compact_result result) +{ + return false; +} + +static inline bool compaction_failed(enum compact_result result) +{ + return false; +} + +static inline bool compaction_withdrawn(enum compact_result result) +{ + return true; +} + static inline int kcompactd_run(int nid) { return 0; -- cgit v1.2.3 From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:00 -0700 Subject: mm, oom: rework oom detection __alloc_pages_slowpath has traditionally relied on the direct reclaim and did_some_progress as an indicator that it makes sense to retry allocation rather than declaring OOM. shrink_zones had to rely on zone_reclaimable if shrink_zone didn't make any progress to prevent from a premature OOM killer invocation - the LRU might be full of dirty or writeback pages and direct reclaim cannot clean those up. zone_reclaimable allows to rescan the reclaimable lists several times and restart if a page is freed. This is really subtle behavior and it might lead to a livelock when a single freed page keeps allocator looping but the current task will not be able to allocate that single page. OOM killer would be more appropriate than looping without any progress for unbounded amount of time. This patch changes OOM detection logic and pulls it out from shrink_zone which is too low to be appropriate for any high level decisions such as OOM which is per zonelist property. It is __alloc_pages_slowpath which knows how many attempts have been done and what was the progress so far therefore it is more appropriate to implement this logic. The new heuristic is implemented in should_reclaim_retry helper called from __alloc_pages_slowpath. It tries to be more deterministic and easier to follow. It builds on an assumption that retrying makes sense only if the currently reclaimable memory + free pages would allow the current allocation request to succeed (as per __zone_watermark_ok) at least for one zone in the usable zonelist. This alone wouldn't be sufficient, though, because the writeback might get stuck and reclaimable pages might be pinned for a really long time or even depend on the current allocation context. Therefore there is a backoff mechanism implemented which reduces the reclaim target after each reclaim round without any progress. This means that we should eventually converge to only NR_FREE_PAGES as the target and fail on the wmark check and proceed to OOM. The backoff is simple and linear with 1/16 of the reclaimable pages for each round without any progress. We are optimistic and reset counter for successful reclaim rounds. 
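To make the backoff concrete, here is a small standalone sketch of the arithmetic described above. The page counts are made-up inputs; in the kernel the resulting estimate is fed into __zone_watermark_ok() for each zone rather than printed, and retries stop either when the check fails everywhere or after MAX_RECLAIM_RETRIES rounds without progress.

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16

/* Same rounding the kernel's DIV_ROUND_UP() does. */
#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
	/* Made-up inputs: reclaimable LRU pages and free pages in one zone. */
	unsigned long reclaimable = 32768;
	unsigned long free = 128;
	int no_progress_loops;

	for (no_progress_loops = 0; no_progress_loops <= MAX_RECLAIM_RETRIES;
	     no_progress_loops++) {
		unsigned long available = reclaimable;

		/* Linear backoff: subtract 1/16 of reclaimable per failed round. */
		available -= DIV_ROUND_UP(no_progress_loops * available,
					  MAX_RECLAIM_RETRIES);
		available += free;

		printf("round %2d without progress: watermark target %lu pages\n",
		       no_progress_loops, available);
	}
	return 0;
}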
Costly high order pages mostly preserve their semantic and those without __GFP_REPEAT fail right away while those which have the flag set will back off after the amount of reclaimable pages reaches equivalent of the requested order. The only difference is that if there was no progress during the reclaim we rely on zone watermark check. This is more logical thing to do than previous 1< Acked-by: Hillf Danton Cc: Vladimir Davydov Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++----- mm/vmscan.c | 25 +++---------- 3 files changed, 97 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index ad220359f1b0..0af2bb2028fd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ +extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8bcc10616fab..fa39efc3a692 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; } +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is consider as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round), pages_reclaimed (cumulative number of reclaimed + * pages) and no_progress_loops (number of reclaim rounds without any progress + * in a row) is considered as well as the reclaimable pages on the applicable + * zone list (with a backoff mechanism which is a function of no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, unsigned long pages_reclaimed, + int no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in the row. + */ + if (no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (pages_reclaimed >= (1<zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + + available = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available? 
+ */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac->high_zoneidx, alloc_flags, available)) { + /* Wait for some write requests to complete then retry */ + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + return true; + } + } + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_result compact_result; + int no_progress_loops = 0; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3525,23 +3597,35 @@ retry: if (gfp_mask & __GFP_NORETRY) goto noretry; - /* Keep reclaiming pages as long as there is reasonable progress */ - pages_reclaimed += did_some_progress; - if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || - ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); - goto retry; + /* + * Do not retry costly high order allocations unless they are + * __GFP_REPEAT + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + goto noretry; + + if (did_some_progress) { + no_progress_loops = 0; + pages_reclaimed += did_some_progress; + } else { + no_progress_loops++; } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, pages_reclaimed, + no_progress_loops)) + goto retry; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; /* Retry as long as the OOM killer is making progress */ - if (did_some_progress) + if (did_some_progress) { + no_progress_loops = 0; goto retry; + } noretry: /* diff --git a/mm/vmscan.c b/mm/vmscan.c index a386454c015a..c4a2f4512fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -static unsigned long zone_reclaimable_pages(struct zone *zone) +unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; @@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * Returns true if a zone was reclaimable. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); - bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; - if (nr_soft_reclaimed) - reclaimable = true; /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) - reclaimable = true; - - if (global_reclaim(sc) && - !reclaimable && zone_reclaimable(zone)) - reclaimable = true; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); } /* @@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return reclaimable; } /* @@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool zones_reclaimable; retry: delayacct_freepages_start(); @@ -2638,7 +2625,7 @@ retry: vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - zones_reclaimable = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2685,10 +2672,6 @@ retry: goto retry; } - /* Any of the zones still reclaimable? Don't OOM. */ - if (zones_reclaimable) - return 1; - return 0; } -- cgit v1.2.3 From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:12 -0700 Subject: mm, oom, compaction: prevent from should_compact_retry looping for ever for costly orders "mm: consider compaction feedback also for costly allocation" has removed the upper bound for the reclaim/compaction retries based on the number of reclaimed pages for costly orders. While this is desirable the patch did miss a mis interaction between reclaim, compaction and the retry logic. 
The direct reclaim tries to get zones over min watermark while compaction backs off and returns COMPACT_SKIPPED when all zones are below low watermark + 1< Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++++ include/linux/mmzone.h | 3 +++ mm/compaction.c | 42 +++++++++++++++++++++++++++++++++++++++--- mm/page_alloc.c | 23 +++++++++++++---------- 4 files changed, 59 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 8d8c916fe67a..a58c852a268f 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result) return false; } + +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags); + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60db2096fd8..8dd0333b01dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags); diff --git a/mm/compaction.c b/mm/compaction.c index 4af1577adb5c..d8a20fcf8678 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone, */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order, { enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order, return ret; } +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim. 
+ */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. + */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { enum compact_result ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dea406a62e3d..089f760ce64a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, - unsigned int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; @@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, - enum migrate_mode *migrate_mode, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, /* * make sure the compaction wasn't deferred or didn't bail out early * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. 
*/ if (compaction_withdrawn(compact_result)) - return true; + return compaction_zonelist_suitable(ac, order, alloc_flags); /* * !costly requests are much more important than __GFP_REPEAT @@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { @@ -3706,8 +3708,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && - should_compact_retry(order, compact_result, - &migration_mode, compaction_retries)) + should_compact_retry(ac, order, alloc_flags, + compact_result, &migration_mode, + compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.2.3 From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:18 -0700 Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space") not only clears TIF_MEMDIE for oom reaped task but also set OOM_SCORE_ADJ_MIN for the target task to hide it from the oom killer. This works in simple cases but it is not sufficient for (unlikely) cases where the mm is shared between independent processes (as they do not share signal struct). If the mm had only small amount of memory which could be reaped then another task sharing the mm could be selected and that wouldn't help to move out from the oom situation. Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set. Set the flag after __oom_reap_task is done with a task. This will force the select_bad_process() to ignore all already oom reaped tasks as well as no such task is sacrificed for its parent. Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 31bd0d97d178..40eabf176ce2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_REAPED 21 /* mm has been already reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 415f7eb913fa..c0376efa79ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * This task can be safely ignored because we cannot do much more * to release its memory. 
*/ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + set_bit(MMF_OOM_REAPED, &mm->flags); out: mmput(mm); return ret; -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 +++++ kernel/fork.c | 50 +++++++++++++++++++++++++++++++++--------------- mm/oom_kill.c | 8 ++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fda9c99ef95..d553855503e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40eabf176ce2..479e3cade7e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. 
*/ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0376efa79ec..c0e37dd1422f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk) */ set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + mmput_async(mm); return ret; } -- cgit v1.2.3 From 98748bd722005be9de2662bd4f7e41ad8148bdbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:24 -0700 Subject: oom: consider multi-threaded tasks in task_will_free_mem task_will_free_mem is a misnomer for a more complex PF_EXITING test for early break out from the oom killer because it is believed that such a task would release its memory shortly and so we do not have to select an oom victim and perform a disruptive action. Currently we make sure that the given task is not participating in the core dumping because it might get blocked for a long time - see commit d003f371b270 ("oom: don't assume that a coredumping thread will exit soon"). The check can still do better though. We shouldn't consider the task unless the whole thread group is going down. This is rather unlikely but not impossible. A single exiting thread would surely leave all the address space behind. If we are really unlucky it might get stuck on the exit path and keep its TIF_MEMDIE and so block the oom killer. 
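The resulting predicate reduces to three checks, shown here as a self-contained userspace mock. The flag values and struct mock_task are illustrative stand-ins for the real task_struct/signal_struct fields; the logic mirrors the oom.h hunk that follows.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; the real ones live in include/linux/sched.h. */
#define PF_EXITING		0x1
#define SIGNAL_GROUP_EXIT	0x2
#define SIGNAL_GROUP_COREDUMP	0x4

struct mock_task {
	unsigned int flags;		/* per-thread flags */
	unsigned int signal_flags;	/* flags shared by the thread group */
	int nr_threads;			/* threads in the group */
};

/* Mirrors the three checks of the reworked task_will_free_mem(). */
static bool task_will_free_mem(const struct mock_task *task)
{
	/* A coredumping group may sleep in exit_mm() for a long time. */
	if (task->signal_flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (!(task->flags & PF_EXITING))
		return false;

	/* A single exiting thread is not enough; the whole group must go down. */
	if (task->nr_threads > 1 && !(task->signal_flags & SIGNAL_GROUP_EXIT))
		return false;

	return true;
}

int main(void)
{
	struct mock_task lone_exiting = { PF_EXITING, 0, 1 };
	struct mock_task one_of_many = { PF_EXITING, 0, 8 };

	printf("single-threaded exiting task: %d\n", task_will_free_mem(&lone_exiting));
	printf("one exiting thread of eight:  %d\n", task_will_free_mem(&one_of_many));
	return 0;
}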
Link: http://lkml.kernel.org/r/1460452756-15491-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Tetsuo Handa Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 83b9c39bd8b7..d3f533f2f481 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -110,13 +110,24 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); static inline bool task_will_free_mem(struct task_struct *task) { + struct signal_struct *sig = task->signal; + /* * A coredumping process may sleep for an extended period in exit_mm(), * so the oom killer cannot assume that the process will promptly exit * and release memory. */ - return (task->flags & PF_EXITING) && - !(task->signal->flags & SIGNAL_GROUP_COREDUMP); + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (!(task->flags & PF_EXITING)) + return false; + + /* Make sure that the whole thread group is going down */ + if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) + return false; + + return true; } /* sysctls */ -- cgit v1.2.3 From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 20 May 2016 16:57:27 -0700 Subject: mm,oom: speed up select_bad_process() loop Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"), select_bad_process() is using for_each_process_thread(). Since oom_unkillable_task() scans all threads in the caller's thread group and oom_task_origin() scans signal_struct of the caller's thread group, we don't need to call oom_unkillable_task() and oom_task_origin() on each thread. Also, since !mm test will be done later at oom_badness(), we don't need to do !mm test on each thread. Therefore, we only need to do TIF_MEMDIE test on each thread. Although the original code was correct it was quite inefficient because each thread group was scanned num_threads times which can be a lot especially with processes with many threads. Even though the OOM is extremely cold path it is always good to be as effective as possible when we are inside rcu_read_lock() - aka unpreemptible context. If we track number of TIF_MEMDIE threads inside signal_struct, we don't need to do TIF_MEMDIE test on each thread. This will allow select_bad_process() to use for_each_process(). This patch adds a counter to signal_struct for tracking how many TIF_MEMDIE threads are in a given thread group, and check it at oom_scan_process_thread() so that select_bad_process() can use for_each_process() rather than for_each_process_thread(). 
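A userspace sketch of the counter pattern, assuming C11 atomics; struct mock_signal_struct and the helper bodies are stand-ins for the kernel's atomic_t in signal_struct and the TIF_MEMDIE flag handling. The point it illustrates is that the selection loop now needs one read per process instead of a flag test on every thread.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the counter this patch adds to signal_struct. */
struct mock_signal_struct {
	atomic_int oom_victims;	/* # of TIF_MEMDIE threads in the group */
};

static void mark_oom_victim(struct mock_signal_struct *sig)
{
	atomic_fetch_add(&sig->oom_victims, 1);
}

static void exit_oom_victim(struct mock_signal_struct *sig)
{
	atomic_fetch_sub(&sig->oom_victims, 1);
}

/*
 * One read per process replaces a TIF_MEMDIE test on every thread of the
 * group, which is what lets select_bad_process() walk processes only.
 */
static bool group_has_oom_victim(struct mock_signal_struct *sig)
{
	return atomic_load(&sig->oom_victims) != 0;
}

int main(void)
{
	struct mock_signal_struct sig;

	atomic_init(&sig.oom_victims, 0);

	mark_oom_victim(&sig);
	printf("abort the scan? %d\n", group_has_oom_victim(&sig));
	exit_oom_victim(&sig);
	printf("abort the scan? %d\n", group_has_oom_victim(&sig));
	return 0;
}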
[mhocko@suse.com: do not blow the signal_struct size] Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 479e3cade7e9..01fe1bb68754 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -669,6 +669,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0e37dd1422f..5bb2f7698ad7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - continue; chosen = p; chosen_points = points; @@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.2.3 From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 May 2016 16:57:38 -0700 Subject: mm/vmalloc: keep a separate lazy-free list When mixing lots of vmallocs and set_memory_*() (which calls vm_unmap_aliases()) I encountered situations where the performance degraded severely due to the walking of the entire vmap_area list each invocation. One simple improvement is to add the lazily freed vmap_area to a separate lockless free list, such that we then avoid having to walk the full list on each purge. 
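The core of the change is the lock-free "push from many producers, drain once by the purger" pattern provided by the kernel's llist. A minimal userspace analogue is sketched below using C11 atomics; the vmap_area is reduced to an illustrative lazy_node carrying only a start/end range, and lazy_free()/purge_all() model llist_add()/llist_del_all() rather than reproduce them.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* A vmap_area reduced to the fields that matter for the purge list. */
struct lazy_node {
	struct lazy_node *next;
	unsigned long va_start, va_end;
};

static _Atomic(struct lazy_node *) purge_list;

/* llist_add() analogue: lock-free push, callable from many producers. */
static void lazy_free(struct lazy_node *node)
{
	struct lazy_node *first = atomic_load(&purge_list);

	do {
		node->next = first;
	} while (!atomic_compare_exchange_weak(&purge_list, &first, node));
}

/* llist_del_all() analogue: the purger grabs the whole list in one swap. */
static struct lazy_node *purge_all(void)
{
	return atomic_exchange(&purge_list, NULL);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct lazy_node *node = malloc(sizeof(*node));

		node->va_start = i * 4096UL;
		node->va_end = node->va_start + 4096;
		lazy_free(node);
	}

	for (struct lazy_node *node = purge_all(), *next; node; node = next) {
		next = node->next;
		printf("purging [%lu, %lu)\n", node->va_start, node->va_end);
		free(node);
	}
	return 0;
}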
Signed-off-by: Chris Wilson Reviewed-by: Roman Pen Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Daniel Vetter Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Pen Cc: Mel Gorman Cc: Toshi Kani Cc: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 ++- mm/vmalloc.c | 39 +++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d1f1d338af20..957adb741b6f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* pgprot_t */ #include @@ -44,7 +45,7 @@ struct vmap_area { unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ + struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; struct rcu_head rcu_head; }; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae7d20b447ff..6e3291882739 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; @@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (sync) purge_fragmented_blocks_allcpus(); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - rcu_read_unlock(); if (nr) atomic_sub(nr, &vmap_lazy_nr); @@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + int nr_lazy; + + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } -- cgit v1.2.3 From b8ca9e3a612eaf3e54c6fa136c62246a1a9aece7 Mon 
Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 May 2016 16:57:53 -0700 Subject: mm: tighten fault_in_pages_writeable() copy_page_to_iter_iovec() is currently the only user of fault_in_pages_writeable(), and it definitely can use fragments from high order pages. Make sure fault_in_pages_writeable() is only touching two adjacent pages at most, as claimed. Signed-off-by: Eric Dumazet Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fe1513ffb7bf..97354102794d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -518,33 +518,27 @@ void page_endio(struct page *page, int rw, int err); extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); /* - * Fault a userspace page into pagetables. Return non-zero on a fault. - * - * This assumes that two userspace pages are always sufficient. + * Fault one or two userspace pages into pagetables. + * Return -EINVAL if more than two pages would be needed. + * Return non-zero on a fault. */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { - int ret; + int span, ret; if (unlikely(size == 0)) return 0; + span = offset_in_page(uaddr) + size; + if (span > 2 * PAGE_SIZE) + return -EINVAL; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ ret = __put_user(0, uaddr); - if (ret == 0) { - char __user *end = uaddr + size - 1; - - /* - * If the page was already mapped, this will get a cache miss - * for sure, so try to avoid doing it. - */ - if (((unsigned long)uaddr & PAGE_MASK) != - ((unsigned long)end & PAGE_MASK)) - ret = __put_user(0, end); - } + if (ret == 0 && span > PAGE_SIZE) + ret = __put_user(0, uaddr + size - 1); return ret; } -- cgit v1.2.3 From 7fab358d90e6ba9d9cb702bee0c8a5f5c13bb6df Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:57:59 -0700 Subject: include/linux/hugetlb*.h: clean up code Macro HUGETLBFS_SB is clear enough, so one statement is clearer than 3 lines statements. Remove redundant return statements for non-return functions, which can save lines, at least. 
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 +--- include/linux/hugetlb_cgroup.h | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e44c57876e89..7ef4b635015d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -353,9 +353,7 @@ extern unsigned int default_hstate_idx; static inline struct hstate *hstate_inode(struct inode *i) { - struct hugetlbfs_sb_info *hsb; - hsb = HUGETLBFS_SB(i->i_sb); - return hsb->hstate; + return HUGETLBFS_SB(i->i_sb)->hstate; } static inline struct hstate *hstate_file(struct file *f) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 24154c26d469..063962f6dfc6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -93,20 +93,17 @@ hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { - return; } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - return; } static inline void hugetlb_cgroup_file_init(void) @@ -116,7 +113,6 @@ static inline void hugetlb_cgroup_file_init(void) static inline void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { - return; } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ -- cgit v1.2.3 From d70c17d436b3fb9dbdae8c93bf908a6110b0cb4f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 20 May 2016 16:58:01 -0700 Subject: include/linux/hugetlb.h: use bool instead of int for hugepage_migration_supported() It is used as a pure bool function within kernel source wide. 
Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7ef4b635015d..c26d4638f665 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -452,12 +452,12 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline int hugepage_migration_supported(struct hstate *h) +static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION return huge_page_shift(h) == PMD_SHIFT; #else - return 0; + return false; #endif } @@ -519,7 +519,7 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define hugepage_migration_supported(h) 0 +#define hugepage_migration_supported(h) false static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) -- cgit v1.2.3 From 0c9ad804f178eae02a34045bb0916fa0e31623d5 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 20 May 2016 16:58:04 -0700 Subject: mm fix commmets: if SPARSEMEM, pgdata doesn't have page_ext If SPARSEMEM, use page_ext in mem_section if !SPARSEMEM, use page_ext in pgdata Signed-off-by: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8dd0333b01dc..02069c23486d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1063,7 +1063,7 @@ struct mem_section { unsigned long *pageblock_flags; #ifdef CONFIG_PAGE_EXTENSION /* - * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use + * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; -- cgit v1.2.3 From d2a1a1f0a97a77d25cbada37161dc2ecdf01f93d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 20 May 2016 16:58:16 -0700 Subject: mm: use unsigned long constant for page flags struct page->flags is unsigned long, so when shifting bits we should use UL suffix to match it. Found this problem after I added 64-bit CPU specific page flags and failed to compile the kernel: mm/page_alloc.c: In function '__free_one_page': mm/page_alloc.c:672:2: error: integer overflow in expression [-Werror=overflow] Link: http://lkml.kernel.org/r/1461971723-16187-1-git-send-email-yuzhao@google.com Signed-off-by: Yu Zhao Cc: "Kirill A . 
Shutemov" Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Jerome Marchand Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a61e06e5fbce..e5a32445f930 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -479,7 +479,7 @@ static inline void ClearPageCompound(struct page *page) } #endif -#define PG_head_mask ((1L << PG_head)) +#define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); @@ -670,7 +670,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) } #ifdef CONFIG_MMU -#define __PG_MLOCKED (1 << PG_mlocked) +#define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif @@ -680,11 +680,11 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * these flags set. It they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ - (1 << PG_lru | 1 << PG_locked | \ - 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_writeback | 1 << PG_reserved | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED) + (1UL << PG_lru | 1UL << PG_locked | \ + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \ + 1UL << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. @@ -695,10 +695,10 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ - (1 << PG_private | 1 << PG_private_2) + (1UL << PG_private | 1UL << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked -- cgit v1.2.3 From 5f527c2b3ea261bfccb7d12f9feade924cc4987c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 20 May 2016 16:58:24 -0700 Subject: mm: thp: microoptimize compound_mapcount() compound_mapcount() is only called after PageCompound() has already been checked by the caller, so there's no point to check it again. Gcc may optimize it away too because it's inline but this will remove the runtime check for sure and add it'll add an assert instead. Link: http://lkml.kernel.org/r/1462547040-1737-3-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Kirill A. 
Shutemov Cc: Alex Williamson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2b97be1147ec..65d18a45b8e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -475,8 +475,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page) static inline int compound_mapcount(struct page *page) { - if (!PageCompound(page)) - return 0; + VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); return atomic_read(compound_mapcount_ptr(page)) + 1; } -- cgit v1.2.3 From d2005e3f41d4f9299e2df6a967c8beb5086967a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 May 2016 16:58:36 -0700 Subject: userfaultfd: don't pin the user memory in userfaultfd_file_create() userfaultfd_file_create() increments mm->mm_users; this means that the memory won't be unmapped/freed if mm owner exits/execs, and UFFDIO_COPY after that can populate the orphaned mm more. Change userfaultfd_file_create() and userfaultfd_ctx_put() to use mm->mm_count to pin mm_struct. This means that atomic_inc_not_zero(mm->mm_users) is needed when we are going to actually play with this memory. Except handle_userfault() path doesn't need this, the caller must already have a reference. The patch adds the new trivial helper, mmget_not_zero(), it can have more users. Link: http://lkml.kernel.org/r/20160516172254.GA8595@redhat.com Signed-off-by: Oleg Nesterov Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 41 ++++++++++++++++++++++++++++------------- include/linux/sched.h | 7 ++++++- 2 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 66cdb44616d5..2d97952e341a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file) ACCESS_ONCE(ctx->released) = true; + if (!mmget_not_zero(mm)) + goto wakeup; + /* * Flush page faults out of all CPUs. 
NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if @@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } up_write(&mm->mmap_sem); - + mmput(mm); +wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = uffdio_register.range.start; end = start + uffdio_register.range.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); if (!ret) { /* * Now that we scanned all vmas we can already tell @@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = uffdio_unregister.start; end = start + uffdio_unregister.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); out: return ret; } @@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) goto out; - - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) @@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + if (mmget_not_zero(ctx->mm)) { + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) @@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) { - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } out: diff --git a/include/linux/sched.h b/include/linux/sched.h index 01fe1bb68754..6b3213d96da6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2723,12 +2723,17 @@ extern struct mm_struct * mm_alloc(void); /* mmdrop drops the mm and the page tables */ extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct * mm) +static inline void mmdrop(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +static inline bool mmget_not_zero(struct mm_struct *mm) +{ + return atomic_inc_not_zero(&mm->mm_users); +} + /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); /* same 
as above but performs the slow path from the async kontext. Can -- cgit v1.2.3 From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Fri, 20 May 2016 16:58:38 -0700 Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments Since commit 92923ca3aace ("mm: meminit: only set page reserved in the memblock region") the reserved bit is set on reserved memblock regions. However start and end address are passed as unsigned long. This is only 32bit on i386, so it can end up marking the wrong pages reserved for ranges at 4GB and above. This was observed on a 32bit Xen dom0 which was booted with initial memory set to a value below 4G but allowing to balloon in memory (dom0_mem=1024M for example). This would define a reserved bootmem region for the additional memory (for example on a 8GB system there was a reverved region covering the 4GB-8GB range). But since the addresses were passed on as unsigned long, this was actually marking all pages from 0 to 4GB as reserved. Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region") Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com Signed-off-by: Stefan Bader Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65d18a45b8e8..fbdb9d40847f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); -extern void reserve_bootmem_region(unsigned long start, unsigned long end); +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f4b69aaa23a..2dd1ba4e70cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); -- cgit v1.2.3 From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 May 2016 16:58:41 -0700 Subject: mm: make faultaround produce old ptes Currently, faultaround code produces young pte. This can screw up vmscan behaviour[1], as it makes vmscan think that these pages are hot and not push them out on first round. During sparse file access faultaround gets more pages mapped and all of them are young. Under memory pressure, this makes vmscan swap out anon pages instead, or to drop other page cache pages which otherwise stay resident. Modify faultaround to produce old ptes, so they can easily be reclaimed under memory pressure. This can to some extend defeat the purpose of faultaround on machines without hardware accessed bit as it will not help us with reducing the number of minor page faults. 
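A rough model of that behaviour in plain C (invented names, not the kernel code): only the slot that actually faulted is made young, so a reclaim pass that skips young entries will naturally evict the speculatively mapped pages first.

#include <stdbool.h>
#include <stdio.h>

struct toy_pte {
        bool present;
        bool young;     /* stands in for the accessed bit */
};

/* Map a window of pages around a fault; only the faulting slot is made young. */
static void toy_fault_around(struct toy_pte *ptes, int n, int faulting)
{
        for (int i = 0; i < n; i++) {
                ptes[i].present = true;
                ptes[i].young = (i == faulting);  /* cf. pte_mkold()/pte_mkyoung() */
        }
}

int main(void)
{
        struct toy_pte ptes[8] = { { false, false } };

        toy_fault_around(ptes, 8, 3);
        for (int i = 0; i < 8; i++)
                printf("pte %d: %s\n", i, ptes[i].young ? "young" : "old");
        return 0;
}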
We may want to disable faultaround on such machines altogether, but that's subject for separate patchset. Minchan: "I tested 512M mmap sequential word read test on non-HW access bit system (i.e., ARM) and confirmed it doesn't increase minor fault any more. old: 4096 fault_around minor fault: 131291 elapsed time: 6747645 usec new: 65536 fault_around minor fault: 131291 elapsed time: 6709263 usec 0.56% benefit" [1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Minchan Kim Tested-by: Minchan Kim Acked-by: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/filemap.c | 2 +- mm/memory.c | 23 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fbdb9d40847f..f223ac26b5d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); + struct page *page, pte_t *pte, bool write, bool anon, bool old); #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index 8f4859989f1b..b418405903bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2191,7 +2191,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(vma, addr, page, pte, false, false, true); unlock_page(page); goto next; unlock: diff --git a/mm/memory.c b/mm/memory.c index 007c72ad03f6..f29e5ab0342d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) + struct page *page, pte_t *pte, bool write, bool anon, bool old) { pte_t entry; @@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old) + entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) goto unlock_out; + do_fault_around(vma, address, pte, pgoff, flags); + /* Check if the fault is handled by faultaround */ + if (!pte_same(*pte, orig_pte)) { + /* + * Faultaround produce old pte, but the pte we've + * handler fault for should be young. 
+ */ + pte_t entry = pte_mkyoung(*pte); + if (ptep_set_access_flags(vma, address, pte, entry, 0)) + update_mmu_cache(vma, address, pte); + goto unlock_out; + } pte_unmap_unlock(pte, ptl); } @@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false); + do_set_pte(vma, address, fault_page, pte, false, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true); + do_set_pte(vma, address, new_page, pte, true, true, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); + do_set_pte(vma, address, fault_page, pte, true, false, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3 From 0bb2fd13b69abfd88880f356903b5c7ca36d5eea Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 20 May 2016 16:58:59 -0700 Subject: mm: page_is_guard(): return false when page_ext arrays are not allocated yet When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: ffff88017c080040 ti: ffff88017c084000 task.ti: ffff88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: 0000:ffff88017c087c48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000980 RSI: 0000000000000080 RDI: 0000000000660401 RBP: ffff88017c087cd0 R08: 0000000000000401 R09: 0000000000000009 R10: ffff88017c080040 R11: 000000000000000a R12: 0000000000000400 R13: ffffea0019810000 R14: ffffea0019810040 R15: ffff88066cfe6080 FS: 0000000000000000(0000) GS:ffff88066cd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000002406000 CR4: 00000000000006e0 Call Trace: free_hot_cold_page+0x192/0x1d0 __free_pages+0x5c/0x90 __free_pages_boot_core+0x11a/0x14e deferred_free_range+0x50/0x62 deferred_init_memmap+0x220/0x3c3 kthread+0xf8/0x110 ret_from_fork+0x22/0x40 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: 0000000000000000 The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. 
When it returns NULL, page_is_guard() should just return false instead of checking PAGE_EXT_DEBUG_GUARD unconditionally. Link: http://lkml.kernel.org/r/1463610225-29060-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f223ac26b5d9..b530c99e8e81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2386,6 +2386,9 @@ static inline bool page_is_guard(struct page *page) return false; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return false; + return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); } #else -- cgit v1.2.3 From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 May 2016 16:59:11 -0700 Subject: mm: kasan: initial memory quarantine implementation Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. When the object is freed, its state changes from KASAN_STATE_ALLOC to KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine instead of being returned to the allocator, therefore every subsequent access to that object triggers a KASAN error, and the error handler is able to say where the object has been allocated and deallocated. When it's time for the object to leave quarantine, its state becomes KASAN_STATE_FREE and it's returned to the allocator. From now on the allocator may reuse it for another allocation. Before that happens, it's still possible to detect a use-after free on that object (it retains the allocation/deallocation stacks). When the allocator reuses this object, the shadow is unpoisoned and old allocation/deallocation stacks are wiped. Therefore a use of this object, even an incorrect one, won't trigger ASan warning. Without the quarantine, it's not guaranteed that the objects aren't reused immediately, that's why the probability of catching a use-after-free is lower than with quarantine in place. Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. Freed objects are first added to per-cpu quarantine queues. When a cache is destroyed or memory shrinking is requested, the objects are moved into the global quarantine queue. Whenever a kmalloc call allows memory reclaiming, the oldest objects are popped out of the global queue until the total size of objects in quarantine is less than 3/4 of the maximum quarantine size (which is a fraction of installed physical memory). As long as an object remains in the quarantine, KASAN is able to report accesses to it, so the chance of reporting a use-after-free is increased. Once the object leaves quarantine, the allocator may reuse it, in which case the object is unpoisoned and KASAN can't detect incorrect accesses to it. Right now quarantine support is only enabled in SLAB allocator. Unification of KASAN features in SLAB and SLUB will be done later. This patch is based on the "mm: kasan: quarantine" patch originally prepared by Dmitry Chernenkov. A number of improvements have been suggested by Andrey Ryabinin. 
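A back-of-the-envelope sketch of the sizing policy described above, reusing the constants the patch introduces (a global cap of 1/32 of RAM minus 1 MiB per-CPU queue budgets, purged down to 3/4 of the cap); the helper name and the figures in main() are made up for illustration:

#include <stdio.h>

#define TOY_QUARANTINE_FRACTION 32              /* global cap = RAM / 32 */
#define TOY_PERCPU_BYTES        (1ULL << 20)    /* 1 MiB per-CPU queue budget */

/* Global cap after subtracting the per-CPU budgets (cf. quarantine_reduce()). */
static unsigned long long toy_quarantine_cap(unsigned long long ram_bytes,
                                             unsigned int ncpus)
{
        return ram_bytes / TOY_QUARANTINE_FRACTION -
               (unsigned long long)ncpus * TOY_PERCPU_BYTES;
}

int main(void)
{
        unsigned long long cap = toy_quarantine_cap(8ULL << 30, 4); /* 8 GiB, 4 CPUs */

        printf("global cap: %llu MiB, purge target (3/4): %llu MiB\n",
               cap >> 20, (cap * 3 / 4) >> 20);
        return 0;
}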
[glider@google.com: v9] Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- mm/kasan/Makefile | 1 + mm/kasan/kasan.c | 57 ++++++++-- mm/kasan/kasan.h | 21 +++- mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/report.c | 1 + mm/mempool.c | 2 +- mm/slab.c | 12 ++- mm/slab.h | 2 + mm/slab_common.c | 2 + 10 files changed, 387 insertions(+), 15 deletions(-) create mode 100644 mm/kasan/quarantine.c (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 737371b56044..611927f5870d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); +void kasan_cache_shrink(struct kmem_cache *cache); +void kasan_cache_destroy(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); -void kasan_slab_free(struct kmem_cache *s, void *object); +bool kasan_slab_free(struct kmem_cache *s, void *object); +void kasan_poison_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { int alloc_meta_offset; @@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} +static inline void kasan_cache_shrink(struct kmem_cache *cache) {} +static inline void kasan_cache_destroy(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size, static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) {} -static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +{ + return false; +} +static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 131daadf40e4..1548749a3d45 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-y := kasan.o report.o kasan_init.o +obj-$(CONFIG_SLAB) += quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 38f1dd79acdb..8df666bb23be 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } #endif +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void 
kasan_cache_destroy(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ #ifdef CONFIG_SLAB - if (cache->flags & SLAB_KASAN) { - struct kasan_free_meta *free_info = - get_free_info(cache, object); + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + if (likely(cache->flags & SLAB_KASAN)) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track, GFP_NOWAIT); + struct kasan_free_meta *free_info = + get_free_info(cache, object); + + switch (alloc_info->state) { + case KASAN_STATE_ALLOC: + alloc_info->state = KASAN_STATE_QUARANTINE; + quarantine_put(free_info, cache); + set_track(&free_info->track, GFP_NOWAIT); + kasan_poison_slab_free(cache, object); + return true; + case KASAN_STATE_QUARANTINE: + case KASAN_STATE_FREE: + pr_err("Double free"); + dump_stack(); + break; + default: + break; + } } + return false; +#else + kasan_poison_slab_free(cache, object); + return false; #endif - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(object == NULL)) return; @@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30a2f0ba0e09..7f7ac51d7faf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -62,6 +62,7 @@ struct kasan_global { enum kasan_state { KASAN_STATE_INIT, KASAN_STATE_ALLOC, + KASAN_STATE_QUARANTINE, KASAN_STATE_FREE }; @@ -79,9 +80,14 @@ struct kasan_alloc_meta { u32 reserved; }; +struct qlist_node { + struct qlist_node *next; +}; struct kasan_free_meta { - /* Allocator freelist pointer, unused by KASAN. */ - void **freelist; + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. 
+ */ + struct qlist_node quarantine_link; struct kasan_track track; }; @@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +#ifdef CONFIG_SLAB +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..4973505a9bdd --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,291 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a signle-linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +static void qlist_move(struct qlist_head *from, struct qlist_node *last, + struct qlist_head *to, size_t size) +{ + if (unlikely(last == from->tail)) { + qlist_move_all(from, to); + return; + } + if (qlist_empty(to)) + to->head = from->head; + else + to->tail->next = from->head; + to->tail = last; + from->head = last->next; + last->next = NULL; + from->bytes -= size; + to->bytes += size; +} + + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +static struct qlist_head global_quarantine; +static DEFINE_SPINLOCK(quarantine_lock); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. 
+ */ +#define QUARANTINE_FRACTION 32 + +#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) +#define QUARANTINE_PERCPU_SIZE (1 << 20) + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long flags; + + local_irq_save(flags); + alloc_info->state = KASAN_STATE_FREE; + ___cache_free(cache, object, _THIS_IP_); + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + qlist_move_all(q, &temp); + + local_irq_restore(flags); + + if (unlikely(!qlist_empty(&temp))) { + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_all(&temp, &global_quarantine); + spin_unlock_irqrestore(&quarantine_lock, flags); + } +} + +void quarantine_reduce(void) +{ + size_t new_quarantine_size; + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + size_t size_to_free = 0; + struct qlist_node *last; + + if (likely(READ_ONCE(global_quarantine.bytes) <= + READ_ONCE(quarantine_size))) + return; + + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. 
+ */ + new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus(); + WRITE_ONCE(quarantine_size, new_quarantine_size); + + last = global_quarantine.head; + while (last) { + struct kmem_cache *cache = qlink_to_cache(last); + + size_to_free += cache->size; + if (!last->next || size_to_free > + global_quarantine.bytes - QUARANTINE_LOW_SIZE) + break; + last = last->next; + } + qlist_move(&global_quarantine, last, &to_free, size_to_free); + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *prev = NULL, *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + while (curr) { + struct qlist_node *qlink = curr; + struct kmem_cache *obj_cache = qlink_to_cache(qlink); + + if (obj_cache == cache) { + if (unlikely(from->head == qlink)) { + from->head = curr->next; + prev = curr; + } else + prev->next = curr->next; + if (unlikely(from->tail == qlink)) + from->tail = curr->next; + from->bytes -= cache->size; + qlist_put(to, qlink, cache->size); + } else { + prev = curr; + } + curr = curr->next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_cache(&global_quarantine, &to_free, cache); + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 60869a5a0124..b3c122ddd454 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page, print_track(&alloc_info->track); break; case KASAN_STATE_FREE: + case KASAN_STATE_QUARANTINE: pr_err("Object freed, allocated with size %u bytes\n", alloc_info->alloc_size); free_info = get_free_info(cache, object); diff --git a/mm/mempool.c b/mm/mempool.c index 9b7a14a791cc..9e075f829d0d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); + kasan_poison_slab_free(pool->pool_data, element); if (pool->alloc == mempool_kmalloc) kasan_kfree(element); if (pool->alloc == mempool_alloc_pages) diff --git a/mm/slab.c b/mm/slab.c index c11bf5007952..28864c022430 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3547,9 +3547,17 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct array_cache *ac = cpu_cache_get(cachep); + /* Put the object into the quarantine, don't touch it for now. 
*/ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} - kasan_slab_free(cachep, objp); +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index 5969769fbee6..dedb1a920fb8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3239bfd758e6..a65dad7fdcd1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); + kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; @@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:28 -0700 Subject: mm/kasan: add API to check memory regions Memory access coded in an assembly won't be seen by KASAN as a compiler can instrument only C code. Add kasan_check_[read,write]() API which is going to be used to check a certain memory range. Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- include/linux/kasan-checks.h | 12 ++++++++++++ mm/kasan/kasan.c | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 include/linux/kasan-checks.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 374ffa2d81b7..8b92445561b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6242,7 +6242,7 @@ S: Maintained F: arch/*/include/asm/kasan.h F: arch/*/mm/kasan_init* F: Documentation/kasan.txt -F: include/linux/kasan.h +F: include/linux/kasan*.h F: lib/test_kasan.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h new file mode 100644 index 000000000000..b7f8aced7870 --- /dev/null +++ b/include/linux/kasan-checks.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_KASAN_CHECKS_H +#define _LINUX_KASAN_CHECKS_H + +#ifdef CONFIG_KASAN +void kasan_check_read(const void *p, unsigned int size); +void kasan_check_write(const void *p, unsigned int size); +#else +static inline void kasan_check_read(const void *p, unsigned int size) { } +static inline void kasan_check_write(const void *p, unsigned int size) { } +#endif + +#endif diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e5beb40d97b1..18b6a2b8d183 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + #undef memset void *memset(void *addr, int c, size_t len) { -- cgit v1.2.3 From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:48 -0700 Subject: zsmalloc: require GFP in zs_malloc() Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to zs_create_pool(), so we can be more flexible, but, more importantly, we need this to switch zram to per-cpu compression streams -- zram will try to allocate handle with preemption disabled in a fast path and switch to a slow path (using different gfp mask) if the fast one has failed. Apart from that, this also align zs_malloc() interface with zspool/zbud. 
[sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask] Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 24 +++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f76016d..b09acdb753ee 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (!handle) { pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 34eb16098a33..57a8e98f2708 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,10 +41,10 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(const char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); void *zs_map_object(struct zs_pool *pool, unsigned long handle, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ae288c9f7156..aba39a291523 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 
0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - if (zs_pool_stat_create(pool, name)) goto err; -- cgit v1.2.3 From 5f56a5dfdb9bcb3bca03df59980d4d2f012cbb53 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:16 -0700 Subject: exit_thread: remove empty bodies Define HAVE_EXIT_THREAD for archs which want to do something in exit_thread. For others, let's define exit_thread as an empty inline. This is a cleanup before we change the prototype of exit_thread to accept a task parameter. [akpm@linux-foundation.org: fix mips] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 5 +++++ arch/alpha/kernel/process.c | 8 -------- arch/arc/kernel/process.c | 7 ------- arch/arm/Kconfig | 1 + arch/arm64/kernel/process.c | 7 ------- arch/avr32/Kconfig | 1 + arch/blackfin/include/asm/processor.h | 7 ------- arch/c6x/kernel/process.c | 4 ---- arch/cris/Kconfig | 1 + arch/cris/arch-v10/kernel/process.c | 9 --------- arch/frv/include/asm/processor.h | 7 ------- arch/h8300/include/asm/processor.h | 7 ------- arch/hexagon/kernel/process.c | 7 ------- arch/ia64/Kconfig | 1 + arch/m32r/kernel/process.c | 9 --------- arch/m68k/include/asm/processor.h | 7 ------- arch/metag/Kconfig | 1 + arch/metag/include/asm/processor.h | 2 -- arch/microblaze/include/asm/processor.h | 10 ---------- arch/mips/kernel/process.c | 4 ---- arch/mn10300/Kconfig | 1 + arch/nios2/include/asm/processor.h | 5 ----- arch/openrisc/include/asm/processor.h | 9 --------- arch/parisc/kernel/process.c | 7 ------- arch/powerpc/kernel/process.c | 4 ---- arch/s390/Kconfig | 1 + arch/score/kernel/process.c | 2 -- arch/sh/Kconfig | 1 + arch/sh/kernel/process_32.c | 7 ------- arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/um/kernel/process.c | 4 ---- arch/unicore32/kernel/process.c | 7 ------- arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + include/linux/sched.h | 7 +++++++ 36 files changed, 24 insertions(+), 140 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5e7e17..0f298f9123dc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -517,6 +517,11 @@ config HAVE_ARCH_MMAP_RND_BITS - ARCH_MMAP_RND_BITS_MIN - ARCH_MMAP_RND_BITS_MAX +config HAVE_EXIT_THREAD + bool + help + An architecture implements exit_thread. + config ARCH_MMAP_RND_BITS_MIN int diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 84d13263ce46..b483156698d5 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -210,14 +210,6 @@ start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void -exit_thread(void) -{ -} - void flush_thread(void) { diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index a3f750e76b68..b5db9e7fd649 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -183,13 +183,6 @@ void flush_thread(void) { } -/* - * Free any architecture-specific thread data structures, etc. 
- */ -void exit_thread(void) -{ -} - int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) { return 0; diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b99d25b4133e..956d3575426c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -50,6 +50,7 @@ config ARM select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 48eea6866c67..6cd2612236dc 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -200,13 +200,6 @@ void show_regs(struct pt_regs * regs) __show_regs(regs); } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - static void tls_thread_flush(void) { asm ("msr tpidr_el0, xzr"); diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 18b88779e701..e43519a2ca89 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -4,6 +4,7 @@ config AVR32 # that we usually don't need on AVR32. select EXPERT select HAVE_CLK + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 7acd46653df3..0c265aba94ad 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -75,13 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index 3ae9f5a166a0..0ee7686a78f3 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -82,10 +82,6 @@ void flush_thread(void) { } -void exit_thread(void) -{ -} - /* * Do necessary setup to start up a newly executed thread. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 99bda1ba3d2f..5c0ca8ae9293 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -59,6 +59,7 @@ config CRIS select GENERIC_IOMAP select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 + select HAVE_EXIT_THREAD if ETRAX_ARCH_V32 select OLD_SIGSUSPEND select OLD_SIGACTION select GPIOLIB diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 02b783457be0..96e5afef6b47 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -35,15 +35,6 @@ void default_idle(void) local_irq_enable(); } -/* - * Free current thread data structures etc.. - */ - -void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* if the watchdog is enabled, we can simply disable interrupts and go * into an eternal loop, and the watchdog will reset the CPU after 0.1s * if on the other hand the watchdog wasn't enabled, we just enable it and wait diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index ae8d423e79d9..73f0a79ad8e6 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -96,13 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ... #define release_segments(mm) do { } while (0) #define forget_segments() do { } while (0) -/* - * Free current thread data structures etc.. 
- */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 54e3fd83c336..111df7397ac7 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -110,13 +110,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - /* * Return saved PC of a blocked thread. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index a9ebd471823a..d9edfd3fc52a 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -136,13 +136,6 @@ void release_thread(struct task_struct *dead_task) { } -/* - * Free any architecture-specific thread data structures, etc. - */ -void exit_thread(void) -{ -} - /* * Some archs flush debug and FPU info here */ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index b534ebab36ea..f80758cb7157 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -18,6 +18,7 @@ config IA64 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_EXIT_THREAD select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index e69221d581d5..a88b1f01e91f 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -101,15 +101,6 @@ void show_regs(struct pt_regs * regs) #endif } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* Nothing to do. */ - DPRINTK("pid = %d\n", current->pid); -} - void flush_thread(void) { DPRINTK("pid = %d\n", current->pid); diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 20dda1d4b860..a6ce2ec8d693 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -153,13 +153,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Free current thread data structures etc.. - */ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index a0fa88da3e31..e47a08d72819 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -11,6 +11,7 @@ config METAG select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_KERNEL_BZIP2 diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index 0838ca699764..a0333ebcac35 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -134,8 +134,6 @@ static inline void release_thread(struct task_struct *dead_task) #define copy_segments(tsk, mm) do { } while (0) #define release_segments(mm) do { } while (0) -extern void exit_thread(void); - /* * Return saved PC of a blocked thread. */ diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 497a988d79c2..c38d0dd91134 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -70,11 +70,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free all resources held by a thread. 
*/ -static inline void exit_thread(void) -{ -} - extern unsigned long thread_saved_pc(struct task_struct *t); extern unsigned long get_wchan(struct task_struct *p); @@ -127,11 +122,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc. */ -static inline void exit_thread(void) -{ -} - /* Return saved (kernel) PC of a blocked thread. */ # define thread_saved_pc(tsk) \ ((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index a6b3dc54260a..411c971e3417 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -73,10 +73,6 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) regs->regs[29] = sp; } -void exit_thread(void) -{ -} - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { /* diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 06ddb5501ab1..9627e81a6cbb 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -1,5 +1,6 @@ config MN10300 def_bool y + select HAVE_EXIT_THREAD select HAVE_OPROFILE select HAVE_UID16 select GENERIC_IRQ_SHOW diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index c2ba45c159c7..1c953f0cadbf 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Free current thread data structures etc.. */ -static inline void exit_thread(void) -{ -} - /* Return saved PC of a blocked thread. */ #define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea) diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 4d235e3d2534..70334c9f7d24 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -84,15 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); -/* - * Free current thread data structures etc.. - */ - -extern inline void exit_thread(void) -{ - /* Nothing needs to be done. */ -} - /* * Return saved PC of a blocked thread. For now, this is the "user" PC */ diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 809905a811ed..40639439d8b3 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -144,13 +144,6 @@ void machine_power_off(void) void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL(pm_power_off); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { /* Only needs to handle fpu stuff or perf monitors. 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ea8a28fd6f31..e2f12cbcade9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1329,10 +1329,6 @@ void show_regs(struct pt_regs * regs) show_instructions(regs); } -void exit_thread(void) -{ -} - void flush_thread(void) { #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index de0fcc08dff5..e2c9aaaf64b2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -134,6 +134,7 @@ config S390 select HAVE_DMA_API_DEBUG select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index a1519ad3d49d..aae9480706c2 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -56,8 +56,6 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) regs->regs[0] = sp; } -void exit_thread(void) {} - /* * When a process does an "exec", machine state like FPU and debug * registers need to be reset. This is a hook function for that. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7ed20fc3fc81..cb93af8f8017 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -71,6 +71,7 @@ config SUPERH32 config SUPERH64 def_bool ARCH = "sh64" + select HAVE_EXIT_THREAD select KALLSYMS config ARCH_DEFCONFIG diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 2885fc9d9dcd..ee12e9451874 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -76,13 +76,6 @@ void start_thread(struct pt_regs *regs, unsigned long new_pc, } EXPORT_SYMBOL(start_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index db0a26cffa97..27b3a0ad40a0 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -20,6 +20,7 @@ config SPARC select HAVE_OPROFILE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK + select HAVE_EXIT_THREAD select SYSCTL_EXCEPTION_TRACE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select RTC_CLASS diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 81719302b056..174746225577 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -3,6 +3,7 @@ config TILE def_bool y + select HAVE_EXIT_THREAD select HAVE_PERF_EVENTS select USE_PMC if PERF_EVENTS select HAVE_DMA_API_DEBUG diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 48af59aae129..0b04711f1f18 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -103,10 +103,6 @@ void interrupt_end(void) tracehook_notify_resume(regs); } -void exit_thread(void) -{ -} - int get_current_pid(void) { return task_pid_nr(current); diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b008e9961465..00299c927852 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -201,13 +201,6 @@ void show_regs(struct pt_regs *regs) __backtrace(); } -/* - * Free current thread data structures etc.. 
- */ -void exit_thread(void) -{ -} - void flush_thread(void) { struct thread_info *thread = current_thread_info(); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ace79d2da2c3..8ff5b3be95d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,6 +105,7 @@ config X86 select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_FP_TEST diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 85257afe71c3..64336f666fb6 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -14,6 +14,7 @@ config XTENSA select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select HAVE_DMA_API_DEBUG + select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU select HAVE_HW_BREAKPOINT if PERF_EVENTS diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b3213d96da6..167c0d4bf3fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2769,7 +2769,14 @@ static inline int copy_thread_tls( } #endif extern void flush_thread(void); + +#ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(void); +#else +static inline void exit_thread(void) +{ +} +#endif extern void exit_files(struct task_struct *); extern void __cleanup_sighand(struct sighand_struct *); -- cgit v1.2.3 From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 4 ++-- arch/arm/vfp/vfpmodule.c | 4 ---- arch/avr32/kernel/process.c | 4 ++-- arch/cris/arch-v32/kernel/process.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/process.c | 14 +++++++------- arch/metag/kernel/process.c | 6 +++--- arch/mn10300/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 5 +++-- arch/sh/kernel/process_64.c | 5 ++--- arch/sparc/kernel/process_32.c | 12 ++++++------ arch/sparc/kernel/process_64.c | 4 ++-- arch/tile/kernel/process.c | 4 ++-- arch/x86/kernel/process.c | 5 ++--- arch/xtensa/kernel/process.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 2 +- 17 files changed, 42 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 4adfb46e3ee9..a647d6642f3e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - thread_notify(THREAD_NOTIFY_EXIT, current_thread_info()); + thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk)); } void flush_thread(void) diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 2a61e4b04600..73085d3482ed 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread) * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. * THREAD_NOTIFY_EXIT - * - the thread (v) will be running on the local CPU, so - * v === current_thread_info() - * - thread->cpu is the local CPU number at the time it is accessed, - * but may change at any time. * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. 
*/ diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 42a53e740a7e..68e5b9dac059 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -62,9 +62,9 @@ void machine_restart(char *cmd) /* * Free current thread data structures etc */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - ocd_disable(current); + ocd_disable(tsk); } void flush_thread(void) diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c7ce784a393c..4d1afa9f9fd3 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -33,9 +33,9 @@ void default_idle(void) */ extern void deconfigure_bp(long pid); -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - deconfigure_bp(current->pid); + deconfigure_bp(tsk->pid); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9cd607b06964..2436ad5f92c1 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg /* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) + * called only from exit_thread() + * we come here only if the task has a context attached (loaded or masked) */ void pfm_exit_thread(struct task_struct *task) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index b51514957620..aae6c4dc7ae7 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -570,22 +570,22 @@ flush_thread (void) } /* - * Clean up state associated with current thread. This is called when + * Clean up state associated with a thread. This is called when * the thread calls exit(). */ void -exit_thread (void) +exit_thread (struct task_struct *tsk) { - ia64_drop_fpu(current); + ia64_drop_fpu(tsk); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); + if (tsk->thread.pfm_context) + pfm_exit_thread(tsk); /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(tsk); #endif } diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 7f546183a0f0..35062796edf2 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -345,10 +345,10 @@ void flush_thread(void) /* * Free current thread data structures etc. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - clear_fpu(¤t->thread); - clear_dsp(¤t->thread); + clear_fpu(&tsk->thread); + clear_dsp(&tsk->thread); } /* TODO: figure out how to unwind the kernel stack here to figure out diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 74a96ccf7451..cbede4e88dee 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs) /* * free current thread data structures etc.. 
*/ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_fpu(current); + exit_fpu(tsk); } void flush_thread(void) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 481d7a83efc6..bba4fa74b321 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -68,9 +68,10 @@ extern void kernel_thread_starter(void); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_thread_runtime_instr(); + if (tsk == current) + exit_thread_runtime_instr(); } void flush_thread(void) diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index e2062e643341..9d3e9916555d 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { /* * See arch/sparc/kernel/process.c for the precedent for doing @@ -307,9 +307,8 @@ void exit_thread(void) * which it would get safely nulled. */ #ifdef CONFIG_SH_FPU - if (last_task_used_math == current) { + if (last_task_used_math == tsk) last_task_used_math = NULL; - } #endif } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index c5113c7ce2fd..b7780a5bef11 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifndef CONFIG_SMP - if(last_task_used_math == current) { + if (last_task_used_math == tsk) { #else - if (test_thread_flag(TIF_USEDFPU)) { + if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) { #endif /* Keep process from leaving FPU in a bogon state. */ put_psr(get_psr() | PSR_EF); - fpsave(¤t->thread.float_regs[0], ¤t->thread.fsr, - ¤t->thread.fpqueue[0], ¤t->thread.fpqdepth); + fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr, + &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth); #ifndef CONFIG_SMP last_task_used_math = NULL; #else - clear_thread_flag(TIF_USEDFPU); + clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU); #endif } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index c16ef1af1843..fa14402b33f9 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk) } /* Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct thread_info *t = current_thread_info(); + struct thread_info *t = task_thread_info(tsk); if (t->utraps) { if (t->utraps[0] < 2) diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index b5f30d376ce1..6b705ccc9cc1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -541,7 +541,7 @@ void flush_thread(void) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifdef CONFIG_HARDWALL /* @@ -550,7 +550,7 @@ void exit_thread(void) * the last reference to a hardwall fd, it would already have * been released and deactivated at this point.) 
*/ - hardwall_deactivate_all(current); + hardwall_deactivate_all(tsk); #endif } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 5bbfed81c97b..e0ded48561db 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -115,10 +115,10 @@ void arch_cpu_idle(void) /* * This is called when the thread calls exit(). */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #if XTENSA_HAVE_COPROCESSORS - coprocessor_release_all(current_thread_info()); + coprocessor_release_all(task_thread_info(tsk)); #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 167c0d4bf3fa..02bdab4d6db7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2771,9 +2771,9 @@ static inline int copy_thread_tls( extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD -extern void exit_thread(void); +extern void exit_thread(struct task_struct *tsk); #else -static inline void exit_thread(void) +static inline void exit_thread(struct task_struct *tsk) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit v1.2.3 From 2eeed7e98d6a1341b1574893a95ce5b8379140f2 Mon Sep 17 00:00:00 2001 From: René Nyffenegger Date: Fri, 20 May 2016 17:00:30 -0700 Subject: include/linux/syscalls.h: use pid_t instead of int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In include/linux/syscalls.h, the four functions sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo are declared with "int pid" and "int tgid". However, in kernel/signal.c, the corresponding definitions use the more appropriate "pid_t" (which is a typedef'd int). This patch changes "int" to "pid_t" in the declarations of sys_kill, sys_tgkill, sys_tkill and sys_rt_sigqueueinfo in in order to harmonize the function declarations with their respective definitions. 
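For context only (not part of the patch): pid_t is a typedef'd integer type and the C library already declares the corresponding userspace interfaces with pid_t, so the change is purely declarative and alters no ABI or behaviour. A minimal userspace sketch of the matching libc prototype in use:

	#include <signal.h>
	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t self = getpid();	/* libc already types process IDs as pid_t */

		/* Signal 0 only performs existence/permission checking. */
		if (kill(self, 0) == 0)
			printf("process %d is alive\n", (int)self);
		return 0;
	}
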
Link: http://lkml.kernel.org/r/57302FDA.7020205@renenyffenegger.ch Signed-off-by: René Nyffenegger Cc: Andy Lutomirski Cc: Josh Triplett Cc: Al Viro Cc: "Steven Rostedt (Red Hat)" Cc: Zach Brown Cc: Milosz Tanski Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d795472c54d8..d02239022bd0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -371,10 +371,10 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, size_t sigsetsize); asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t __user *uinfo); -asmlinkage long sys_kill(int pid, int sig); -asmlinkage long sys_tgkill(int tgid, int pid, int sig); -asmlinkage long sys_tkill(int pid, int sig); -asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); +asmlinkage long sys_kill(pid_t pid, int sig); +asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig); +asmlinkage long sys_tkill(pid_t pid, int sig); +asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); asmlinkage long sys_signal(int sig, __sighandler_t handler); -- cgit v1.2.3 From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. 
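A simplified, single-buffer userspace model of the scheme described above may help; it is an illustration only (the names are made up), it ignores the per-CPU handling, the memory barriers and the irq_work hand-off, and it assumes a single writer per buffer, as NMIs do not nest:

	#include <stdarg.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define BUF_LEN 256

	static struct {
		atomic_int len;			/* length of written data */
		char buffer[BUF_LEN];
	} nmi_buf;

	/* Writer side, mirroring vprintk_nmi(): append, retry if a flush raced us. */
	static int nmi_log(const char *fmt, ...)
	{
		va_list args;
		int len, add;

	again:
		len = atomic_load(&nmi_buf.len);
		if (len >= BUF_LEN)
			return 0;		/* buffer full, message is dropped */

		va_start(args, fmt);
		add = vsnprintf(nmi_buf.buffer + len, BUF_LEN - len, fmt, args);
		va_end(args);

		/* A concurrent flush may have reset len to 0; if so, start over. */
		if (!atomic_compare_exchange_strong(&nmi_buf.len, &len, len + add))
			goto again;
		return add;
	}

	/* Reader side, mirroring __printk_nmi_flush(): print, then truncate. */
	static void nmi_flush(void)
	{
		int len = atomic_load(&nmi_buf.len);

		if (!len)
			return;			/* someone else already flushed */
		printf("%.*s", len, nmi_buf.buffer);
		atomic_compare_exchange_strong(&nmi_buf.len, &len, 0);
	}

	int main(void)
	{
		nmi_log("NMI backtrace for cpu %d\n", 0);
		nmi_flush();
		return 0;
	}

The real implementation additionally pairs the cmpxchg with explicit barriers and defers the flushing to IRQ work in a safe context, as described above.
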
The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 + arch/arm/Kconfig | 1 + arch/arm/kernel/smp.c | 2 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/cris/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 1 - include/linux/hardirq.h | 2 + include/linux/percpu.h | 3 - include/linux/printk.h | 12 ++- init/Kconfig | 5 + init/main.c | 1 + kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 +++++++++ kernel/printk/nmi.c | 219 ++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- lib/nmi_backtrace.c | 89 +---------------- 24 files changed, 306 insertions(+), 107 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 0f298f9123dc..8f84fd268dee 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -187,7 +187,11 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_NMI + bool + config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool # # An arch should select this if it provides all these things: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 956d3575426c..90542db1220d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -67,6 +67,7 @@ config ARM select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_OPTPROBES if !THUMB2_KERNEL select HAVE_PERF_EVENTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index baee70267f29..df90bc59bfce 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CPU_BACKTRACE: + printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); + printk_nmi_exit(); break; default: diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e43519a2ca89..7e75d45e20cd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -18,6 +18,7 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA + select HAVE_NMI help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index a63c12259e77..28c63fea786d 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -40,6 +40,7 @@ config BLACKFIN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select HAVE_NMI config GENERIC_CSUM def_bool y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 5c0ca8ae9293..deba2662b9f3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -70,6 +70,7 @@ config CRIS select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32 select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 select 
HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32 + select HAVE_NMI config HZ int diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5663f411c225..8040fb1845b4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -48,6 +48,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_CMOS_UPDATE select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select VIRT_TO_BUS select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f0403b58ae8b..01f7464d9fea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,6 +155,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e2c9aaaf64b2..1c3c43d9d1b5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cb93af8f8017..f6254341c065 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select HAVE_NMI help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 27b3a0ad40a0..1012f7ffcdf5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -79,6 +79,7 @@ config SPARC64 select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_NMI config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 174746225577..76989b878f3c 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -30,6 +30,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select HAVE_NMI if USE_PMC select EDAC_SUPPORT select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ff5b3be95d4..0a7b885964ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index dfd59d6bc6f0..c683996110b1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,7 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ @@ -77,6 +78,7 @@ extern void irq_exit(void); preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ + printk_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4bc6dafb703e..56939d3f6e53 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -129,7 
+129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -/* To avoid include hell, as printk can not declare this, we declare it here */ -DECLARE_PER_CPU(printk_func_t, printk_func); - #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9ccbdf2c1453..51dd6b824fe2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK_NMI +extern void printk_nmi_init(void); +extern void printk_nmi_enter(void); +extern void printk_nmi_exit(void); +extern void printk_nmi_flush(void); +#else +static inline void printk_nmi_init(void) { } +static inline void printk_nmi_enter(void) { } +static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_flush(void) { } +#endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/init/Kconfig b/init/Kconfig index 79a91a2c0444..bccc1d607be5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1454,6 +1454,11 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + config BUG bool "BUG() support" if EXPERT default y diff --git a/init/main.c b/init/main.c index 2075fafaad59..fa9b2bdde183 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); + printk_nmi_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. 
+ */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. 
*/ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. 
*/ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c669e..26caf51cc238 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include #include #include -#include #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? 
"all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.2.3 From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). 
Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 6 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/printk.h b/include/linux/printk.h index 51dd6b824fe2..f4da695fd615 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -127,11 +127,13 @@ extern void printk_nmi_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); extern void printk_nmi_flush(void); +extern void printk_nmi_flush_on_panic(void); #else static inline void printk_nmi_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } static inline void printk_nmi_flush(void) { } +static inline void printk_nmi_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. 
+ */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit v1.2.3 From 8da4b8c48e7b43cb16d05e1dbb34ad9f73ab7efd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:00 -0700 Subject: lib/uuid.c: move generate_random_uuid() to uuid.c Let's gather the UUID related functions under one hood. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 21 +-------------------- fs/btrfs/volumes.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 2 +- fs/reiserfs/objectid.c | 2 +- fs/ubifs/sb.c | 2 +- include/linux/random.h | 1 - include/linux/uuid.h | 2 ++ lib/uuid.c | 20 ++++++++++++++++++++ 9 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/char/random.c b/drivers/char/random.c index b583e5336630..0158d3bff7e5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -260,6 +260,7 @@ #include #include #include +#include #include #include @@ -1621,26 +1622,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count, return urandom_read(NULL, buf, count, NULL); } -/*************************************************************** - * Random UUID interface - * - * Used here for a Boot ID, but can be useful for other kernel - * drivers. 
- ***************************************************************/ - -/* - * Generate random UUID - */ -void generate_random_uuid(unsigned char uuid_out[16]) -{ - get_random_bytes(uuid_out, 16); - /* Set UUID version to 4 --- truly random generation */ - uuid_out[6] = (uuid_out[6] & 0x0F) | 0x40; - /* Set the UUID variant to DCE */ - uuid_out[8] = (uuid_out[8] & 0x3F) | 0x80; -} -EXPORT_SYMBOL(generate_random_uuid); - /******************************************************************** * * Sysctl interface diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..bfb80da3e6eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..7497f50cb293 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include "ext4_jbd2.h" #include "ext4.h" diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eb9d027e5981..c6b14951bef3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 99a5d5dae46a..415d66ca87d1 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -3,8 +3,8 @@ */ #include -#include #include +#include #include "reiserfs.h" /* find where objectid map starts */ diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index f4fbc7b6b794..3cbb904a6d7d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,8 +28,8 @@ #include "ubifs.h" #include -#include #include +#include /* * Default journal size in logical eraseblocks as a percent of total diff --git a/include/linux/random.h b/include/linux/random.h index 9c29122037f9..e47e533742b5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -26,7 +26,6 @@ extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); extern void get_random_bytes_arch(void *buf, int nbytes); -void generate_random_uuid(unsigned char uuid_out[16]); extern int random_int_secret_init(void); #ifndef MODULE diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 6df2509033d7..91c2b6d9cbb7 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -33,6 +33,8 @@ static inline int uuid_be_cmp(const uuid_be u1, const uuid_be u2) return memcmp(&u1, &u2, sizeof(uuid_be)); } +void generate_random_uuid(unsigned char uuid[16]); + extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); diff --git a/lib/uuid.c b/lib/uuid.c index 398821e4dce1..6c81c0b0467e 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -23,6 +23,26 @@ #include #include +/*************************************************************** + * Random UUID interface + * + * Used here for a Boot ID, but can be useful for other kernel + * drivers. 
+ ***************************************************************/ + +/* + * Generate random UUID + */ +void generate_random_uuid(unsigned char uuid[16]) +{ + get_random_bytes(uuid, 16); + /* Set UUID version to 4 --- truly random generation */ + uuid[6] = (uuid[6] & 0x0F) | 0x40; + /* Set the UUID variant to DCE */ + uuid[8] = (uuid[8] & 0x3F) | 0x80; +} +EXPORT_SYMBOL(generate_random_uuid); + static void __uuid_gen_common(__u8 b[16]) { prandom_bytes(b, 16); -- cgit v1.2.3 From 2b1b0d66704a8cafe83be7114ec4c15ab3a314ad Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:04 -0700 Subject: lib/uuid.c: introduce a few more generic helpers There are new helpers in this patch: uuid_is_valid checks if a UUID is valid uuid_be_to_bin converts from string to binary (big endian) uuid_le_to_bin converts from string to binary (little endian) They will be used in future, i.e. in the following patches in the series. This also moves the indices arrays to lib/uuid.c to be shared accross modules. [andriy.shevchenko@linux.intel.com: fix typo] Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 13 +++++++++++ lib/uuid.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/vsprintf.c | 9 ++++---- 3 files changed, 82 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 91c2b6d9cbb7..e0b95e728a77 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -22,6 +22,11 @@ #include +/* + * The length of a UUID string ("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") + * not including trailing NUL. + */ +#define UUID_STRING_LEN 36 static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) { @@ -38,4 +43,12 @@ void generate_random_uuid(unsigned char uuid[16]); extern void uuid_le_gen(uuid_le *u); extern void uuid_be_gen(uuid_be *u); +bool __must_check uuid_is_valid(const char *uuid); + +extern const u8 uuid_le_index[16]; +extern const u8 uuid_be_index[16]; + +int uuid_le_to_bin(const char *uuid, uuid_le *u); +int uuid_be_to_bin(const char *uuid, uuid_be *u); + #endif diff --git a/lib/uuid.c b/lib/uuid.c index 6c81c0b0467e..82787f652fbc 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -19,10 +19,17 @@ */ #include +#include +#include #include #include #include +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + /*************************************************************** * Random UUID interface * @@ -65,3 +72,61 @@ void uuid_be_gen(uuid_be *bu) bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_be_gen); + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string. 
+ */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i]] + 0); + int lo = hex_to_bin(uuid[si[i]] + 1); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index be0e7cf11e48..0967771d8f7f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #ifdef CONFIG_BLOCK #include @@ -1304,19 +1305,17 @@ static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { - char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; + char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; - static const u8 be[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - static const u8 le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; - const u8 *index = be; + const u8 *index = uuid_be_index; bool uc = false; switch (*(++fmt)) { case 'L': uc = true; /* fall-through */ case 'l': - index = le; + index = uuid_le_index; break; case 'B': uc = true; -- cgit v1.2.3 From e3a93bce69ad3e2c38927abe311b8cb4f17abbaf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:07 -0700 Subject: lib/uuid.c: remove FSF address There is no point in keeping an address in the file since it's subject to change. While here, update Intel Copyright years. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uuid.h | 6 +----- include/uapi/linux/uuid.h | 4 ---- lib/uuid.c | 6 +----- 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index e0b95e728a77..2d095fc60204 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -1,7 +1,7 @@ /* * UUID/GUID definition * - * Copyright (C) 2010, Intel Corp. + * Copyright (C) 2010, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_UUID_H_ #define _LINUX_UUID_H_ diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 786f0773cc33..3738e5fb6a4d 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _UAPI_LINUX_UUID_H_ diff --git a/lib/uuid.c b/lib/uuid.c index 82787f652fbc..e116ae5fa00f 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -1,7 +1,7 @@ /* * Unified UUID/GUID definition * - * Copyright (C) 2009, Intel Corp. + * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying * * This program is free software; you can redistribute it and/or @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include -- cgit v1.2.3 From ba7e34b1bbd2722685bbc75d168672d5154d8614 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:18 -0700 Subject: include/linux/efi.h: redefine type, constant, macro from generic code Generic UUID library defines structure type, macro to define UUID, and the length of the UUID string. This patch removes duplicate data structure definition, UUID string length constant as well as macro for UUID handling. 
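Call sites are unaffected by the switch: EFI_GUID() keeps its argument order and now simply expands to UUID_LE(), with efi_guid_t becoming a typedef for uuid_le. A minimal sketch, using a made-up GUID value:

#include <linux/efi.h>

/* Example GUID (hypothetical value); efi_guid_t is now an alias for uuid_le. */
static const efi_guid_t example_vendor_guid =
	EFI_GUID(0x12345678, 0x9abc, 0xdef0,
		 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef);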
Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/efi.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index df7acb51f3cc..c2db3ca22217 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -44,17 +45,10 @@ typedef u16 efi_char16_t; /* UNICODE character */ typedef u64 efi_physical_addr_t; typedef void *efi_handle_t; - -typedef struct { - u8 b[16]; -} efi_guid_t; +typedef uuid_le efi_guid_t; #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ -((efi_guid_t) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) /* * Generic EFI table header @@ -1117,7 +1111,7 @@ extern int efi_status_to_err(efi_status_t status); * Length of a GUID string (strlen("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")) * not including trailing NUL */ -#define EFI_VARIABLE_GUID_LEN 36 +#define EFI_VARIABLE_GUID_LEN UUID_STRING_LEN /* * The type of search to perform when calling boottime->locate_handle -- cgit v1.2.3 From 63579785752ba7d0e842078ec6b2875367046f06 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:24 -0700 Subject: include/linux/genhd.h: move to use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c706765404a..359a8e4bd44d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -93,7 +94,7 @@ struct disk_stats { * Enough for the string representation of any kind of UUID plus NULL. * EFI UUID is 36 characters. MSDOS UUID is 11 characters. 
*/ -#define PARTITION_META_INFO_UUIDLTH 37 +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) struct partition_meta_info { char uuid[PARTITION_META_INFO_UUIDLTH]; @@ -228,27 +229,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part) return NULL; } -static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) -{ - int i; - for (i = 0; i < 16; ++i) { - *to++ = (hex_to_bin(*uuid_str) << 4) | - (hex_to_bin(*(uuid_str + 1))); - uuid_str += 2; - switch (i) { - case 3: - case 5: - case 7: - case 9: - uuid_str++; - continue; - } - } -} - static inline int blk_part_pack_uuid(const u8 *uuid_str, u8 *to) { - part_pack_uuid(uuid_str, to); + uuid_be_to_bin(uuid_str, (uuid_be *)to); return 0; } -- cgit v1.2.3 From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. 
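The subtle part is that a tree holding a single entry at index 0 keeps that entry directly in root->rnode with height 0, so a height check cannot tell "empty" apart from "one entry". A minimal sketch of the new helper's behaviour (hypothetical function; locking and error handling omitted):

RADIX_TREE(tree, GFP_KERNEL);		/* statically initialized, empty */

static void radix_tree_empty_example(void *item)
{
	WARN_ON(!radix_tree_empty(&tree));	/* empty: rnode == NULL */

	radix_tree_insert(&tree, 0, item);

	/*
	 * The entry is stored directly in root->rnode, so the height is
	 * still 0 - but radix_tree_empty() correctly reports non-empty.
	 */
	WARN_ON(radix_tree_empty(&tree));
}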
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 +++++ kernel/irq/irqdomain.c | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..83f708e5db59 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -136,6 +136,11 @@ do { \ (root)->rnode = NULL; \ } while (0) +static inline bool radix_tree_empty(struct radix_tree_root *root) +{ + return root->rnode == NULL; +} + /** * Radix-tree synchronization * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit v1.2.3 From 97d778b2de9213c7a7483dad0f533c1af9f0810f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:42 -0700 Subject: radix tree test suite: allow testing other fan-out values The defines in regression2.c are already in radix-tree.h and duplicating them in the test case makes experimenting with other values for the fan-out harder than necessary. Allow the user of the radix tree to decide what the fan-out should be rather than fixing it to 8 for non-kernel uses. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 +--- tools/testing/radix-tree/linux/kernel.h | 2 ++ tools/testing/radix-tree/regression2.c | 7 ------- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 83f708e5db59..5ce5a1e0ecc5 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -70,10 +70,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 -#ifdef __KERNEL__ +#ifndef RADIX_TREE_MAP_SHIFT #define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ #endif #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 76a88f35fdc4..31fe2c77d7ae 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -12,6 +12,8 @@ #define CONFIG_SHMEM #define CONFIG_SWAP +#define RADIX_TREE_MAP_SHIFT 3 + #ifndef NULL #define NULL 0 #endif diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 5d2fa28cdca3..63bf347aaf33 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -51,13 +51,6 @@ #include "regression.h" -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 
4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 #define PAGECACHE_TAG_TOWRITE 2 -- cgit v1.2.3 From 6c4bd68a2962c03423a226d949caf64216d013cc Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:01:51 -0700 Subject: radix-tree: remove unused looping macros radix_tree_for_each_chunk() and radix_tree_for_each_chunk_slot() have never been used in the kernel since their introduction in 2012, so remove them. Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 5ce5a1e0ecc5..e1512a607709 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -478,34 +478,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; } -/** - * radix_tree_for_each_chunk - iterate over chunks - * - * @slot: the void** variable for pointer to chunk first slot - * @root: the struct radix_tree_root pointer - * @iter: the struct radix_tree_iter pointer - * @start: iteration starting index - * @flags: RADIX_TREE_ITER_* and tag index - * - * Locks can be released and reacquired between iterations. - */ -#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \ - for (slot = radix_tree_iter_init(iter, start) ; \ - (slot = radix_tree_next_chunk(root, iter, flags)) ;) - -/** - * radix_tree_for_each_chunk_slot - iterate over slots in one chunk - * - * @slot: the void** variable, at the beginning points to chunk first slot - * @iter: the struct radix_tree_iter pointer - * @flags: RADIX_TREE_ITER_*, should be constant - * - * This macro is designed to be nested inside radix_tree_for_each_chunk(). - * @slot points to the radix tree slot, @iter->index contains its index. - */ -#define radix_tree_for_each_chunk_slot(slot, iter, flags) \ - for (; slot ; slot = radix_tree_next_slot(slot, iter, flags)) - /** * radix_tree_for_each_slot - iterate over non-empty slots * -- cgit v1.2.3 From 21ef533931f73a8e963a6107aa5ec51b192f28be Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 20 May 2016 17:02:26 -0700 Subject: radix-tree: add support for multi-order iterating This enables the macros radix_tree_for_each_slot() and friends to be used with multi-order entries. The way that this works is that we treat all entries in a given slots[] array as a single chunk. If the index given to radix_tree_next_chunk() happens to point us to a sibling entry, we will back up iter->index so that it points to the canonical entry, and that will be the place where we start our iteration. As we're processing a chunk in radix_tree_next_slot(), we process canonical entries, skip over sibling entries, and restart the chunk lookup if we find a non-sibling indirect pointer. This drops back to the radix_tree_next_chunk() code, which will re-walk the tree and look for another chunk. This allows us to properly handle multi-order entries mixed with other entries that are at various heights in the radix tree. 
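From the caller's side nothing changes: the standard iteration loop below keeps working when some of the visited entries are multi-order, because sibling slots are skipped and only the canonical entry is returned. The function is a generic usage sketch (not code from this patch); RCU read-side locking around the walk is assumed to be handled by the caller:

#include <linux/kernel.h>
#include <linux/radix-tree.h>

static void walk_all_entries(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	void **slot;

	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		/* The slot moved under us (node being freed): rewalk the chunk. */
		if (radix_tree_deref_retry(entry)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		pr_info("index %lu -> entry %p\n", iter.index, entry);
	}
}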
Signed-off-by: Ross Zwisler Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 69 +++++++++++++++++++++++---- lib/radix-tree.c | 66 ++++++++++++++----------- tools/testing/radix-tree/generated/autoconf.h | 3 ++ tools/testing/radix-tree/linux/kernel.h | 5 +- 4 files changed, 102 insertions(+), 41 deletions(-) create mode 100644 tools/testing/radix-tree/generated/autoconf.h (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e1512a607709..8558d52e1c7b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -330,8 +330,9 @@ static inline void radix_tree_preload_end(void) * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot - * @next_index: next-to-last index for this chunk + * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating + * @shift: shift for the node that holds our slots * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is @@ -344,8 +345,20 @@ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + unsigned int shift; +#endif }; +static inline unsigned int iter_shift(struct radix_tree_iter *iter) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + return iter->shift; +#else + return 0; +#endif +} + #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ @@ -405,6 +418,12 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) return NULL; } +static inline unsigned long +__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) +{ + return iter->index + (slots << iter_shift(iter)); +} + /** * radix_tree_iter_next - resume iterating when the chunk may be invalid * @iter: iterator state @@ -416,7 +435,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) static inline __must_check void **radix_tree_iter_next(struct radix_tree_iter *iter) { - iter->next_index = iter->index + 1; + iter->next_index = __radix_tree_iter_add(iter, 1); iter->tags = 0; return NULL; } @@ -430,7 +449,12 @@ void **radix_tree_iter_next(struct radix_tree_iter *iter) static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { - return iter->next_index - iter->index; + return (iter->next_index - iter->index) >> iter_shift(iter); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } /** @@ -448,24 +472,51 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { + void *canon = slot; + iter->tags >>= 1; + if (unlikely(!iter->tags)) + return NULL; + while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(slot[1])) { + if (indirect_to_ptr(slot[1]) == canon) { + iter->tags >>= 1; + iter->index = __radix_tree_iter_add(iter, 1); + slot++; + continue; + } + iter->next_index = __radix_tree_iter_add(iter, 1); + return NULL; + } if (likely(iter->tags & 1ul)) { - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); return slot + 1; } - if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) { + if 
(!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset; - iter->index += offset + 1; + iter->index = __radix_tree_iter_add(iter, offset + 1); return slot + offset + 1; } } else { - long size = radix_tree_chunk_size(iter); + long count = radix_tree_chunk_size(iter); + void *canon = slot; - while (--size > 0) { + while (--count > 0) { slot++; - iter->index++; + iter->index = __radix_tree_iter_add(iter, 1); + + if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && + radix_tree_is_indirect_ptr(*slot)) { + if (indirect_to_ptr(*slot) == canon) + continue; + else { + iter->next_index = iter->index; + break; + } + } + if (likely(*slot)) return slot; if (flags & RADIX_TREE_ITER_CONTIG) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index ff460423ff4b..a4da86e40def 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -75,11 +75,6 @@ static inline void *ptr_to_indirect(void *ptr) return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); } -static inline void *indirect_to_ptr(void *ptr) -{ - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); -} - #define RADIX_TREE_RETRY ptr_to_indirect(NULL) #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -885,6 +880,14 @@ int radix_tree_tag_get(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); +static inline void __set_iter_shift(struct radix_tree_iter *iter, + unsigned int shift) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + iter->shift = shift; +#endif +} + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -898,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, { unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *rnode, *node; - unsigned long index, offset, height; + unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; @@ -916,33 +919,39 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!index && iter->index) return NULL; - rnode = rcu_dereference_raw(root->rnode); + restart: + shift = radix_tree_load_root(root, &rnode, &maxindex); + if (index > maxindex) + return NULL; + if (radix_tree_is_indirect_ptr(rnode)) { rnode = indirect_to_ptr(rnode); - } else if (rnode && !index) { + } else if (rnode) { /* Single-slot tree */ - iter->index = 0; - iter->next_index = 1; + iter->index = index; + iter->next_index = maxindex + 1; iter->tags = 1; + __set_iter_shift(iter, shift); return (void **)&root->rnode; } else return NULL; -restart: - height = rnode->path & RADIX_TREE_HEIGHT_MASK; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + shift -= RADIX_TREE_MAP_SHIFT; offset = index >> shift; - /* Index outside of the tree */ - if (offset >= RADIX_TREE_MAP_SIZE) - return NULL; - node = rnode; while (1) { struct radix_tree_node *slot; + unsigned new_off = radix_tree_descend(node, &slot, offset); + + if (new_off < offset) { + offset = new_off; + index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); + index |= offset << shift; + } + if ((flags & RADIX_TREE_ITER_TAGGED) ? 
- !test_bit(offset, node->tags[tag]) : - !node->slots[offset]) { + !tag_get(node, tag, offset) : !slot) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; @@ -954,7 +963,10 @@ restart: offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { - if (node->slots[offset]) + void *slot = node->slots[offset]; + if (is_sibling_entry(node, slot)) + continue; + if (slot) break; } index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); @@ -964,25 +976,23 @@ restart: return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; + slot = rcu_dereference_raw(node->slots[offset]); } - /* This is leaf-node */ - if (!shift) - break; - - slot = rcu_dereference_raw(node->slots[offset]); - if (slot == NULL) + if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; if (!radix_tree_is_indirect_ptr(slot)) break; + node = indirect_to_ptr(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } /* Update the iterator state */ - iter->index = index; - iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + iter->index = index & ~((1 << shift) - 1); + iter->next_index = (index | ((RADIX_TREE_MAP_SIZE << shift) - 1)) + 1; + __set_iter_shift(iter, shift); /* Construct iter->tags bit-mask from node->tags[tag] array */ if (flags & RADIX_TREE_ITER_TAGGED) { diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h new file mode 100644 index 000000000000..ad18cf5a2a3a --- /dev/null +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -0,0 +1,3 @@ +#define CONFIG_RADIX_TREE_MULTIORDER 1 +#define CONFIG_SHMEM 1 +#define CONFIG_SWAP 1 diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 8ea0ed450810..be98a47b4e1b 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -8,10 +8,7 @@ #include #include "../../include/linux/compiler.h" - -#define CONFIG_RADIX_TREE_MULTIORDER -#define CONFIG_SHMEM -#define CONFIG_SWAP +#include "../../../include/linux/kconfig.h" #define RADIX_TREE_MAP_SHIFT 3 -- cgit v1.2.3 From 0c7fa0a8418cbe0e8963fe36db9575d03b8589f7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:07 -0700 Subject: radix-tree: split node->path into offset and height Neither piece of information we're storing in node->path can be larger than 64, so store each in its own unsigned char instead of shifting and masking to store them both in an unsigned int. 
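In practice the change turns two shift-and-mask reads into plain field accesses. The two helpers below are hypothetical illustrations written against the new layout; the commented-out lines show the equivalent reads against the old packed node->path:

static unsigned int node_height(const struct radix_tree_node *node)
{
	/* old: return node->path & RADIX_TREE_HEIGHT_MASK; */
	return node->height;
}

static unsigned int node_offset_in_parent(const struct radix_tree_node *node)
{
	/* old: return node->path >> RADIX_TREE_HEIGHT_SHIFT; */
	return node->offset;
}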
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 7 ++----- lib/radix-tree.c | 38 +++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 8558d52e1c7b..2d2ad9d685a3 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -84,16 +84,13 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Height component in node->path */ -#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1) -#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1) - /* Internally used bits of node->count */ #define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned int path; /* Offset in parent & height from the bottom */ + unsigned char height; /* From the bottom */ + unsigned char offset; /* Slot offset in parent */ unsigned int count; union { struct { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75944e42e4a0..dd04b51e5fbb 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -218,15 +218,15 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned offset, +static void dump_node(struct radix_tree_node *node, unsigned shift, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx path %x count %d parent %p\n", - node, offset, + pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->path, node->count, node->parent); + node->height, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -243,7 +243,7 @@ static void dump_node(struct radix_tree_node *node, unsigned offset, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), i, + dump_node(indirect_to_ptr(entry), shift - RADIX_TREE_MAP_SHIFT, first); } } @@ -257,7 +257,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0, + dump_node(indirect_to_ptr(root->rnode), (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); } #endif @@ -421,7 +421,7 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->path & RADIX_TREE_HEIGHT_MASK); + return radix_tree_maxindex(node->height); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,8 +434,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return (node->path & RADIX_TREE_HEIGHT_MASK) * - RADIX_TREE_MAP_SHIFT; + return node->height * RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -476,9 +475,10 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height+1; - BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK); - node->path = newheight; + newheight = root->height + 1; + BUG_ON(newheight > BITS_PER_LONG); + node->height = newheight; + node->offset = 0; node->count = 1; node->parent = NULL; slot = root->rnode; @@ -546,13 +546,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->path = height; + slot->height = height; + slot->offset = offset; slot->parent = node; if (node) { rcu_assign_pointer(node->slots[offset], ptr_to_indirect(slot)); node->count++; - slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); @@ -1319,11 +1319,10 @@ struct locate_info { static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned long index, struct locate_info *info) { - unsigned int shift, height; + unsigned int shift; unsigned long i; - height = slot->path & RADIX_TREE_HEIGHT_MASK; - shift = height * RADIX_TREE_MAP_SHIFT; + shift = slot->height * RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; @@ -1508,10 +1507,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent = node->parent; if (parent) { - unsigned int offset; - - offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; - parent->slots[offset] = NULL; + parent->slots[node->offset] = NULL; parent->count--; } else { root_tag_clear_all(root); -- cgit v1.2.3 From c12e51b07b3ac4c188fd91a82f96840fdb9cca6f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:10 -0700 Subject: radix-tree: replace node->height with node->shift node->shift represents the shift necessary for looking in the slots array at this level. It is equal to the old (node->height - 1) * RADIX_TREE_MAP_SHIFT. 
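A worked example with RADIX_TREE_MAP_SHIFT == 6: a leaf holds shift 0, its parent shift 6, the grandparent shift 12, and looking up index 0x12345 descends through slot offsets 18, 13 and 5. Choosing the child slot at each level becomes a single shift-and-mask, as in this hypothetical helper:

static unsigned int slot_offset(const struct radix_tree_node *node,
				unsigned long index)
{
	/* e.g. index 0x12345: offset 18 at shift 12, 13 at shift 6, 5 at shift 0 */
	return (index >> node->shift) & RADIX_TREE_MAP_MASK;
}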
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 +- lib/radix-tree.c | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d2ad9d685a3..037458257e12 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -89,7 +89,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned char height; /* From the bottom */ + unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned int count; union { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index dd04b51e5fbb..648da9080418 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -223,10 +223,10 @@ static void dump_node(struct radix_tree_node *node, { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx height %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->height, node->count, node->parent); + node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << shift); @@ -419,9 +419,14 @@ static inline unsigned long radix_tree_maxindex(unsigned int height) return height_to_maxindex[height]; } +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + static inline unsigned long node_maxindex(struct radix_tree_node *node) { - return radix_tree_maxindex(node->height); + return shift_maxindex(node->shift); } static unsigned radix_tree_load_root(struct radix_tree_root *root, @@ -434,7 +439,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, if (likely(radix_tree_is_indirect_ptr(node))) { node = indirect_to_ptr(node); *maxindex = node_maxindex(node); - return node->height * RADIX_TREE_MAP_SHIFT; + return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; @@ -475,9 +480,9 @@ static int radix_tree_extend(struct radix_tree_root *root, } /* Increase the height. 
*/ - newheight = root->height + 1; + newheight = root->height; BUG_ON(newheight > BITS_PER_LONG); - node->height = newheight; + node->shift = newheight * RADIX_TREE_MAP_SHIFT; node->offset = 0; node->count = 1; node->parent = NULL; @@ -490,7 +495,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = newheight; + root->height = ++newheight; } while (height > root->height); out: return height * RADIX_TREE_MAP_SHIFT; @@ -519,7 +524,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, { struct radix_tree_node *node = NULL, *slot; unsigned long maxindex; - unsigned int height, shift, offset; + unsigned int shift, offset; unsigned long max = index | ((1UL << order) - 1); shift = radix_tree_load_root(root, &slot, &maxindex); @@ -537,16 +542,15 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, } } - height = root->height; - offset = 0; /* uninitialised var warning */ while (shift > order) { + shift -= RADIX_TREE_MAP_SHIFT; if (slot == NULL) { /* Have to add a child node. */ slot = radix_tree_node_alloc(root); if (!slot) return -ENOMEM; - slot->height = height; + slot->shift = shift; slot->offset = offset; slot->parent = node; if (node) { @@ -560,8 +564,6 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - height--; - shift -= RADIX_TREE_MAP_SHIFT; node = indirect_to_ptr(slot); offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); @@ -1322,7 +1324,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, unsigned int shift; unsigned long i; - shift = slot->height * RADIX_TREE_MAP_SHIFT; + shift = slot->shift + RADIX_TREE_MAP_SHIFT; do { shift -= RADIX_TREE_MAP_SHIFT; -- cgit v1.2.3 From d0891265bbc988dc91ed8580b38eb3dac128581b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:19 -0700 Subject: radix-tree: remove root->height The only remaining references to root->height were in extend and shrink, where it was updated. Now we can remove it entirely. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 -- lib/radix-tree.c | 106 +++++++++++++-------------------------------- 2 files changed, 31 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 037458257e12..c0d223cfac00 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -110,13 +110,11 @@ struct radix_tree_node { /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { - unsigned int height; gfp_t gfp_mask; struct radix_tree_node __rcu *rnode; }; #define RADIX_TREE_INIT(mask) { \ - .height = 0, \ .gfp_mask = (mask), \ .rnode = NULL, \ } @@ -126,7 +124,6 @@ struct radix_tree_root { #define INIT_RADIX_TREE(root, mask) \ do { \ - (root)->height = 0; \ (root)->gfp_mask = (mask); \ (root)->rnode = NULL; \ } while (0) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 75c9e6197b5b..58f79fee8c71 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,12 +38,6 @@ #include /* in_interrupt() */ -/* - * The height_to_maxindex array needs to be one deeper than the maximum - * path as height 0 holds only 1 entry. 
- */ -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; - /* * Radix tree node cache. */ @@ -218,8 +212,7 @@ radix_tree_find_next_bit(const unsigned long *addr, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, - unsigned shift, unsigned long index) +static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; @@ -229,8 +222,8 @@ static void dump_node(struct radix_tree_node *node, node->shift, node->count, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << shift); - unsigned long last = first | ((1UL << shift) - 1); + unsigned long first = index | (i << node->shift); + unsigned long last = first | ((1UL << node->shift) - 1); void *entry = node->slots[i]; if (!entry) continue; @@ -243,8 +236,7 @@ static void dump_node(struct radix_tree_node *node, pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), - shift - RADIX_TREE_MAP_SHIFT, first); + dump_node(indirect_to_ptr(entry), first); } } } @@ -252,13 +244,12 @@ static void dump_node(struct radix_tree_node *node, /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p height %d rnode %p tags %x\n", - root, root->height, root->rnode, + pr_debug("radix root: %p rnode %p tags %x\n", + root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), - (root->height - 1) * RADIX_TREE_MAP_SHIFT, 0); + dump_node(indirect_to_ptr(root->rnode), 0); } #endif @@ -411,14 +402,8 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) EXPORT_SYMBOL(radix_tree_maybe_preload); /* - * Return the maximum key which can be store into a - * radix tree with height HEIGHT. + * The maximum index which can be stored in a radix tree */ -static inline unsigned long radix_tree_maxindex(unsigned int height) -{ - return height_to_maxindex[height]; -} - static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; @@ -450,24 +435,22 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, - unsigned long index) + unsigned long index, unsigned int shift) { struct radix_tree_node *slot; - unsigned int height; + unsigned int maxshift; int tag; - /* Figure out what the height should be. */ - height = root->height + 1; - while (index > radix_tree_maxindex(height)) - height++; + /* Figure out what the shift should be. */ + maxshift = shift; + while (index > shift_maxindex(maxshift)) + maxshift += RADIX_TREE_MAP_SHIFT; - if (root->rnode == NULL) { - root->height = height; + slot = root->rnode; + if (!slot) goto out; - } do { - unsigned int newheight; struct radix_tree_node *node = radix_tree_node_alloc(root); if (!node) @@ -479,14 +462,11 @@ static int radix_tree_extend(struct radix_tree_root *root, tag_set(node, tag, 0); } - /* Increase the height. 
*/ - newheight = root->height; - BUG_ON(newheight > BITS_PER_LONG); - node->shift = newheight * RADIX_TREE_MAP_SHIFT; + BUG_ON(shift > BITS_PER_LONG); + node->shift = shift; node->offset = 0; node->count = 1; node->parent = NULL; - slot = root->rnode; if (radix_tree_is_indirect_ptr(slot)) { slot = indirect_to_ptr(slot); slot->parent = node; @@ -495,10 +475,11 @@ static int radix_tree_extend(struct radix_tree_root *root, node->slots[0] = slot; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); - root->height = ++newheight; - } while (height > root->height); + shift += RADIX_TREE_MAP_SHIFT; + slot = node; + } while (shift <= maxshift); out: - return height * RADIX_TREE_MAP_SHIFT; + return maxshift + RADIX_TREE_MAP_SHIFT; } /** @@ -531,15 +512,13 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, /* Make sure the tree is high enough. */ if (max > maxindex) { - int error = radix_tree_extend(root, max); + int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; slot = root->rnode; - if (order == shift) { + if (order == shift) shift += RADIX_TREE_MAP_SHIFT; - root->height++; - } } offset = 0; /* uninitialised var warning */ @@ -1412,32 +1391,32 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink height of a radix tree to minimal + * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; - /* try to shrink tree height */ - while (root->height > 0) { + for (;;) { struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - BUG_ON(!radix_tree_is_indirect_ptr(to_free)); + if (!radix_tree_is_indirect_ptr(to_free)) + break; to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or it is a multiorder entry, - * we cannot shrink. + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. */ if (to_free->count != 1) break; slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && (root->height > 1)) + if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; if (radix_tree_is_indirect_ptr(slot)) { @@ -1454,7 +1433,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * one (root->rnode) as far as dependent read barriers go. */ root->rnode = slot; - root->height--; /* * We have a dilemma here. 
The node's slot[0] must not be @@ -1515,7 +1493,6 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, parent->count--; } else { root_tag_clear_all(root); - root->height = 0; root->rnode = NULL; } @@ -1631,26 +1608,6 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxindex(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); -} - static int radix_tree_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1677,6 +1634,5 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxindex(); hotcpu_notifier(radix_tree_callback, 0); } -- cgit v1.2.3 From 30ff46ccb303fb6f6c28b9aa9f2cdc4ba900ed3f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:22 -0700 Subject: radix-tree: rename INDIRECT_PTR to INTERNAL_NODE The name RADIX_TREE_INDIRECT_PTR doesn't really match the meaning. RADIX_TREE_INTERNAL_NODE is a better name. Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 30 +++++++++++++----------------- lib/radix-tree.c | 2 +- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c0d223cfac00..c8cc879046c7 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,20 +29,16 @@ #include /* - * An indirect pointer (root->rnode pointing to a radix_tree_node, rather - * than a data item) is signalled by the low bit set in the root->rnode - * pointer. - * - * In this case root->height is > 0, but the indirect pointer tests are - * needed for RCU lookups (because root->height is unreliable). The only - * time callers need worry about this is when doing a lookup_slot under - * RCU. - * - * Indirect pointer in fact is also used to tag the last pointer of a node - * when it is shrunk, before we rcu free the node. See shrink code for - * details. + * Entries in the radix tree have the low bit set if they refer to a + * radix_tree_node. If the low bit is clear then the entry is user data. + * + * We also use the low bit to indicate that the slot will be freed in the + * next RCU idle period, and users need to re-walk the tree to find the + * new slot for the index that they were looking for. See the comment in + * radix_tree_shrink() for details. 
*/ -#define RADIX_TREE_INDIRECT_PTR 1 +#define RADIX_TREE_INTERNAL_NODE 1 + /* * A common use of the radix tree is to store pointers to struct pages; * but shmem/tmpfs needs also to store swap entries in the same tree: @@ -63,7 +59,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); + return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } /*** radix-tree API starts here ***/ @@ -228,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR); + return unlikely(radix_tree_is_indirect_ptr(arg)); } /** @@ -250,7 +246,7 @@ static inline int radix_tree_exceptional_entry(void *arg) static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & - (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY)); + (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); } /** @@ -448,7 +444,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) static inline void *indirect_to_ptr(void *ptr) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } /** diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 58f79fee8c71..31d5929a625b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -68,7 +68,7 @@ static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static inline void *ptr_to_indirect(void *ptr) { - return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); + return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY ptr_to_indirect(NULL) -- cgit v1.2.3 From 4dd6c0987ca43d6544f4f0a3f86f6ea3bfc60fc1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:27 -0700 Subject: radix-tree: rename indirect_to_ptr() to entry_to_node() Mirrors the earlier commit introducing node_to_entry(). Also change the type returned to be a struct radix_tree_node pointer. That lets us simplify a couple of places in the radix tree shrink & extend paths where we could convert an entry into a pointer, modify the node, then convert the pointer back into an entry. 
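As a rough illustration of the conversion pair the message above refers to, here is a sketch of the two helpers as they look after this rename (illustrative only, not a copy of the kernel header; RADIX_TREE_INTERNAL_NODE is the tag bit renamed in the previous commit):

struct radix_tree_node;

#define RADIX_TREE_INTERNAL_NODE	1UL

/* Tag a node pointer so it can be stored in a slot as an internal entry. */
static inline void *node_to_entry(struct radix_tree_node *node)
{
	return (void *)((unsigned long)node | RADIX_TREE_INTERNAL_NODE);
}

/* Strip the tag and return a usable node pointer in one step. */
static inline struct radix_tree_node *entry_to_node(void *entry)
{
	return (struct radix_tree_node *)
		((unsigned long)entry & ~RADIX_TREE_INTERNAL_NODE);
}

Because entry_to_node() already returns the node type, callers in the shrink and extend paths can write entry_to_node(slot)->parent = node; instead of converting the entry to a pointer, modifying it, and converting it back, as the hunks below show.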
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 12 +++++------ lib/radix-tree.c | 48 ++++++++++++++++++----------------------- tools/testing/radix-tree/test.c | 4 ++-- tools/testing/radix-tree/test.h | 1 - 4 files changed, 28 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c8cc879046c7..b94aa198dd6b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -442,7 +442,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline void *indirect_to_ptr(void *ptr) +static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } @@ -469,7 +469,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(slot[1])) { - if (indirect_to_ptr(slot[1]) == canon) { + if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); slot++; @@ -499,12 +499,10 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && radix_tree_is_indirect_ptr(*slot)) { - if (indirect_to_ptr(*slot) == canon) + if (entry_to_node(*slot) == canon) continue; - else { - iter->next_index = iter->index; - break; - } + iter->next_index = iter->index; + break; } if (likely(*slot)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f66bb3932452..3c3fdd9c5bb3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -230,13 +230,13 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) if (is_sibling_entry(node, entry)) { pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", entry, i, - *(void **)indirect_to_ptr(entry), + *(void **)entry_to_node(entry), first, last); } else if (!radix_tree_is_indirect_ptr(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { - dump_node(indirect_to_ptr(entry), first); + dump_node(entry_to_node(entry), first); } } } @@ -249,7 +249,7 @@ static void radix_tree_dump(struct radix_tree_root *root) root->gfp_mask >> __GFP_BITS_SHIFT); if (!radix_tree_is_indirect_ptr(root->rnode)) return; - dump_node(indirect_to_ptr(root->rnode), 0); + dump_node(entry_to_node(root->rnode), 0); } #endif @@ -422,7 +422,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; if (likely(radix_tree_is_indirect_ptr(node))) { - node = indirect_to_ptr(node); + node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } @@ -467,11 +467,8 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = node; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -542,7 +539,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, break; /* Go a level down */ - node = indirect_to_ptr(slot); + node = entry_to_node(slot); offset = (index >> shift) & 
RADIX_TREE_MAP_MASK; offset = radix_tree_descend(node, &slot, offset); } @@ -645,7 +642,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (node == RADIX_TREE_RETRY) goto restart; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; offset = radix_tree_descend(parent, &node, offset); @@ -729,7 +726,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); BUG_ON(!node); @@ -777,7 +774,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); } @@ -844,7 +841,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; - parent = indirect_to_ptr(node); + parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, offset); if (!node) @@ -904,7 +901,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (radix_tree_is_indirect_ptr(rnode)) { - rnode = indirect_to_ptr(rnode); + rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ iter->index = index; @@ -963,7 +960,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(slot)) break; - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; } @@ -1048,7 +1045,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, return 1; } - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; for (;;) { @@ -1063,7 +1060,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, goto next; /* Sibling slots never have tags set on them */ if (radix_tree_is_indirect_ptr(slot)) { - node = indirect_to_ptr(slot); + node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; } @@ -1322,7 +1319,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, } continue; } - node = indirect_to_ptr(node); + node = entry_to_node(node); if (is_sibling_entry(slot, node)) continue; slot = node; @@ -1367,7 +1364,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) break; } - node = indirect_to_ptr(node); + node = entry_to_node(node); max_index = node_maxindex(node); if (cur_index > max_index) { @@ -1403,7 +1400,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(to_free)) break; - to_free = indirect_to_ptr(to_free); + to_free = entry_to_node(to_free); /* * The candidate node has more than one child, or its child @@ -1418,11 +1415,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) { - slot = indirect_to_ptr(slot); - slot->parent = NULL; - slot = node_to_entry(slot); - } + if (radix_tree_is_indirect_ptr(slot)) + entry_to_node(slot)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply @@ -1481,7 +1475,7 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct 
radix_tree_node *parent; if (node->count) { - if (node == indirect_to_ptr(root->rnode)) + if (node == entry_to_node(root->rnode)) deleted |= radix_tree_shrink(root); return deleted; } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 3004c58b9021..7b0bc1fa5919 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -149,7 +149,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, int i; int j; - slot = indirect_to_ptr(slot); + slot = entry_to_node(slot); /* Verify consistency at this level */ for (i = 0; i < RADIX_TREE_TAG_LONGS; i++) { @@ -227,7 +227,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) return; } - node = indirect_to_ptr(node); + node = entry_to_node(node); assert(maxindex <= node_maxindex(node)); shift = node->shift; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 866c8c676aa4..e85131369723 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -39,7 +39,6 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ -void *indirect_to_ptr(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); -- cgit v1.2.3 From b194d16c27af905d6e3552f4851bc7d9fee4e90f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:30 -0700 Subject: radix-tree: rename radix_tree_is_indirect_ptr() As with indirect_to_ptr(), ptr_to_indirect() and RADIX_TREE_INDIRECT_PTR, change radix_tree_is_indirect_ptr() to radix_tree_is_internal_node(). Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 10 ++++----- lib/radix-tree.c | 48 ++++++++++++++++++++--------------------- tools/testing/radix-tree/test.c | 4 ++-- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b94aa198dd6b..bad63105e37e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -57,7 +57,7 @@ #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) -static inline int radix_tree_is_indirect_ptr(void *ptr) +static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); } @@ -224,7 +224,7 @@ static inline void *radix_tree_deref_slot_protected(void **pslot, */ static inline int radix_tree_deref_retry(void *arg) { - return unlikely(radix_tree_is_indirect_ptr(arg)); + return unlikely(radix_tree_is_internal_node(arg)); } /** @@ -259,7 +259,7 @@ static inline int radix_tree_exception(void *arg) */ static inline void radix_tree_replace_slot(void **pslot, void *item) { - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); rcu_assign_pointer(*pslot, item); } @@ -468,7 +468,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) if (unlikely(!iter->tags)) return NULL; while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(slot[1])) { + radix_tree_is_internal_node(slot[1])) { if (entry_to_node(slot[1]) == canon) { iter->tags >>= 1; iter->index = __radix_tree_iter_add(iter, 1); @@ -498,7 +498,7 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) iter->index = __radix_tree_iter_add(iter, 1); if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_indirect_ptr(*slot)) { + radix_tree_is_internal_node(*slot)) { if (entry_to_node(*slot) == canon) continue; iter->next_index = iter->index; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3c3fdd9c5bb3..b65c83036ca4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -100,7 +100,7 @@ static unsigned radix_tree_descend(struct radix_tree_node *parent, void **entry = rcu_dereference_raw(parent->slots[offset]); #ifdef CONFIG_RADIX_TREE_MULTIORDER - if (radix_tree_is_indirect_ptr(entry)) { + if (radix_tree_is_internal_node(entry)) { unsigned long siboff = get_slot_offset(parent, entry); if (siboff < RADIX_TREE_MAP_SIZE) { offset = siboff; @@ -232,7 +232,7 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) entry, i, *(void **)entry_to_node(entry), first, last); - } else if (!radix_tree_is_indirect_ptr(entry)) { + } else if (!radix_tree_is_internal_node(entry)) { pr_debug("radix entry %p offset %ld indices %ld-%ld\n", entry, i, first, last); } else { @@ -247,7 +247,7 @@ static void radix_tree_dump(struct radix_tree_root *root) pr_debug("radix root: %p rnode %p tags %x\n", root, root->rnode, root->gfp_mask >> __GFP_BITS_SHIFT); - if (!radix_tree_is_indirect_ptr(root->rnode)) + if (!radix_tree_is_internal_node(root->rnode)) return; dump_node(entry_to_node(root->rnode), 0); } @@ -302,7 +302,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_ACCOUNT); out: - BUG_ON(radix_tree_is_indirect_ptr(ret)); + BUG_ON(radix_tree_is_internal_node(ret)); return ret; } @@ -421,7 +421,7 @@ static unsigned radix_tree_load_root(struct radix_tree_root *root, *nodep = node; - if (likely(radix_tree_is_indirect_ptr(node))) { + if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; @@ -467,7 +467,7 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = node; node->slots[0] = slot; slot = node_to_entry(node); @@ -535,7 +535,7 @@ int 
__radix_tree_create(struct radix_tree_root *root, unsigned long index, } else rcu_assign_pointer(root->rnode, node_to_entry(slot)); - } else if (!radix_tree_is_indirect_ptr(slot)) + } else if (!radix_tree_is_internal_node(slot)) break; /* Go a level down */ @@ -585,7 +585,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, void **slot; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); + BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, order, &node, &slot); if (error) @@ -637,7 +637,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, if (index > maxindex) return NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; if (node == RADIX_TREE_RETRY) @@ -720,7 +720,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, shift = radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { unsigned offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -770,7 +770,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, parent = NULL; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -835,7 +835,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (node == NULL) return 0; - while (radix_tree_is_indirect_ptr(node)) { + while (radix_tree_is_internal_node(node)) { int offset; shift -= RADIX_TREE_MAP_SHIFT; @@ -900,7 +900,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if (index > maxindex) return NULL; - if (radix_tree_is_indirect_ptr(rnode)) { + if (radix_tree_is_internal_node(rnode)) { rnode = entry_to_node(rnode); } else if (rnode) { /* Single-slot tree */ @@ -957,7 +957,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, if ((slot == NULL) || (slot == RADIX_TREE_RETRY)) goto restart; - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) break; node = entry_to_node(slot); @@ -1039,7 +1039,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, *first_indexp = last_index + 1; return 0; } - if (!radix_tree_is_indirect_ptr(slot)) { + if (!radix_tree_is_internal_node(slot)) { *first_indexp = last_index + 1; root_tag_set(root, settag); return 1; @@ -1059,7 +1059,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, if (!tag_get(node, iftag, offset)) goto next; /* Sibling slots never have tags set on them */ - if (radix_tree_is_indirect_ptr(slot)) { + if (radix_tree_is_internal_node(slot)) { node = entry_to_node(slot); shift -= RADIX_TREE_MAP_SHIFT; continue; @@ -1152,7 +1152,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1235,7 +1235,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; - if (radix_tree_is_indirect_ptr(results[ret])) { + if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } @@ -1311,7 +1311,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item, rcu_dereference_raw(slot->slots[i]); if (node == 
RADIX_TREE_RETRY) goto out; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { if (node == item) { info->found_index = index; info->stop = true; @@ -1357,7 +1357,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) do { rcu_read_lock(); node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { rcu_read_unlock(); if (node == item) info.found_index = 0; @@ -1398,7 +1398,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) struct radix_tree_node *to_free = root->rnode; struct radix_tree_node *slot; - if (!radix_tree_is_indirect_ptr(to_free)) + if (!radix_tree_is_internal_node(to_free)) break; to_free = entry_to_node(to_free); @@ -1412,10 +1412,10 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) slot = to_free->slots[0]; if (!slot) break; - if (!radix_tree_is_indirect_ptr(slot) && to_free->shift) + if (!radix_tree_is_internal_node(slot) && to_free->shift) break; - if (radix_tree_is_indirect_ptr(slot)) + if (radix_tree_is_internal_node(slot)) entry_to_node(slot)->parent = NULL; /* @@ -1445,7 +1445,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_indirect_ptr(slot)) + if (!radix_tree_is_internal_node(slot)) to_free->slots[0] = RADIX_TREE_RETRY; radix_tree_node_free(to_free); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 7b0bc1fa5919..a6e8099eaf4f 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -193,7 +193,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) { struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) + if (!radix_tree_is_internal_node(node)) return; verify_node(node, tag, !!root_tag_get(root, tag)); } @@ -222,7 +222,7 @@ void tree_verify_min_height(struct radix_tree_root *root, int maxindex) { unsigned shift; struct radix_tree_node *node = root->rnode; - if (!radix_tree_is_indirect_ptr(node)) { + if (!radix_tree_is_internal_node(node)) { assert(maxindex == 0); return; } -- cgit v1.2.3 From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:45 -0700 Subject: radix-tree: introduce radix_tree_replace_clear_tags() In addition to replacing the entry, we also clear all associated tags. This is really a one-off special for page_cache_tree_delete() which had far too much detailed knowledge about how the radix tree works. For efficiency, factor node_tag_clear() out of radix_tree_tag_clear() It can be used by radix_tree_delete_item() as well as radix_tree_replace_clear_tags(). 
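As a hedged sketch of how a caller is expected to use the new interface (remove_entry(), tree and shadow are placeholder names; the real caller this commit converts is page_cache_tree_delete() in mm/filemap.c, shown in the hunks below):

static void remove_entry(struct radix_tree_root *tree, unsigned long index,
			 void *shadow)
{
	struct radix_tree_node *node;

	/* Replace the slot with the shadow entry and clear every tag on it. */
	node = radix_tree_replace_clear_tags(tree, index, shadow);
	if (!node)
		return;		/* the entry lived directly in the root */

	/* per-node accounting would go here, as page_cache_tree_delete() does */
}

The design point is that the caller no longer needs to know about slots, offsets or tag bitmaps; it gets the node back only for its own bookkeeping.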
Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 9 ++++-- lib/radix-tree.c | 76 ++++++++++++++++++++++++++++------------------ mm/filemap.c | 23 ++------------ 3 files changed, 56 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index bad63105e37e..11c8e7cc3920 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, + void **results, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9d9b4b9af4b6..c7114d233b38 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, offset = radix_tree_descend(parent, &node, offset); } - if (node == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - index >>= shift; - - while (parent) { - if (!tag_get(parent, tag, offset)) - goto out; - tag_clear(parent, tag, offset); - if (any_tag_set(parent, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - parent = parent->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root, offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. 
*/ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; @@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(root, index, &node, &slot); + + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } + + radix_tree_replace_slot(slot, entry); + return node; +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index b418405903bc..9665b1d4f318 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - unsigned long index; - unsigned int offset; - unsigned int tag; - void **slot; VM_BUG_ON(!PageLocked(page)); - __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, + shadow); if (shadow) { mapping->nrexceptional++; @@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping, } mapping->nrpages--; - if (!node) { - /* Clear direct pointer tags in root node */ - mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; - radix_tree_replace_slot(slot, shadow); + if (!node) return; - } - - /* Clear tree tags for the removed page */ - index = page->index; - offset = index & RADIX_TREE_MAP_MASK; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (test_bit(offset, node->tags[tag])) - radix_tree_tag_clear(&mapping->page_tree, index, tag); - } - /* Delete page, swap shadow entry */ - radix_tree_replace_slot(slot, shadow); workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); -- cgit v1.2.3 From 78a9be0a0a3367b94af242632c525d22b26f1a87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 May 2016 17:03:51 -0700 Subject: dax: move RADIX_DAX_ definitions to dax.c These don't belong in radix-tree.h any more than PAGECACHE_TAG_* do. Let's try to maintain the idea that radix-tree simply implements an abstract data type. Signed-off-by: NeilBrown Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 9 +++++++++ include/linux/radix-tree.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 0dbe4e0f16fe..a345c168acaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -32,6 +32,15 @@ #include #include +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE))) + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 11c8e7cc3920..c2f69e25ba86 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -48,15 +48,6 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -#define RADIX_DAX_MASK 0xf -#define RADIX_DAX_SHIFT 4 -#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) - static inline int radix_tree_is_internal_node(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); -- cgit v1.2.3 From 3bcadd6fa6c4fd07ace3626357c824eb532488a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:54 -0700 Subject: radix-tree: free up the bottom bit of exceptional entries for reuse We are guaranteed that pointers to radix_tree_nodes always have the bottom two bits clear (because they come from a slab cache, and slab caches have a minimum alignment of sizeof(void *)), so we can redefine 'radix_tree_is_internal_node' to only return true if the bottom two bits have value '01'. This frees up one quarter of the potential values for use by the user. Idea from Neil Brown. Signed-off-by: Matthew Wilcox Suggested-by: Neil Brown Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c2f69e25ba86..cb4b7e8cee81 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -29,28 +29,37 @@ #include /* - * Entries in the radix tree have the low bit set if they refer to a - * radix_tree_node. If the low bit is clear then the entry is user data. - * - * We also use the low bit to indicate that the slot will be freed in the - * next RCU idle period, and users need to re-walk the tree to find the - * new slot for the index that they were looking for. See the comment in - * radix_tree_shrink() for details. + * The bottom two bits of the slot determine how the remaining bits in the + * slot are interpreted: + * + * 00 - data pointer + * 01 - internal entry + * 10 - exceptional entry + * 11 - locked exceptional entry + * + * The internal entry may be a pointer to the next level in the tree, a + * sibling entry, or an indicator that the entry in this slot has been moved + * to another location in the tree and the lookup should be restarted. While + * NULL fits the 'data pointer' pattern, it means that there is no entry in + * the tree for this index (no matter what level of the tree it is found at). + * This means that you cannot store NULL in the tree as a value for the index. 
*/ -#define RADIX_TREE_INTERNAL_NODE 1 +#define RADIX_TREE_ENTRY_MASK 3UL +#define RADIX_TREE_INTERNAL_NODE 1UL /* - * A common use of the radix tree is to store pointers to struct pages; - * but shmem/tmpfs needs also to store swap entries in the same tree: - * those are marked as exceptional entries to distinguish them. + * Most users of the radix tree store pointers but shmem/tmpfs stores swap + * entries in the same tree. They are marked as exceptional entries to + * distinguish them from pointers to struct page. * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. */ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 -static inline int radix_tree_is_internal_node(void *ptr) +static inline bool radix_tree_is_internal_node(void *ptr) { - return (int)((unsigned long)ptr & RADIX_TREE_INTERNAL_NODE); + return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == + RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ @@ -236,8 +245,7 @@ static inline int radix_tree_exceptional_entry(void *arg) */ static inline int radix_tree_exception(void *arg) { - return unlikely((unsigned long)arg & - (RADIX_TREE_INTERNAL_NODE | RADIX_TREE_EXCEPTIONAL_ENTRY)); + return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } /** -- cgit v1.2.3
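To make the two-bit scheme above concrete, here is a small stand-alone sketch (user-space C, illustrative only; the macro names mirror the kernel's but the program is not kernel code). It shows why an entry whose bottom bits are 11 no longer tests as an internal node, which is exactly the quarter of the value space this commit frees up:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define RADIX_TREE_ENTRY_MASK		3UL
#define RADIX_TREE_INTERNAL_NODE	1UL	/* bits 01 */
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2UL	/* bits 10 */

static bool is_internal_node(void *entry)
{
	return ((unsigned long)entry & RADIX_TREE_ENTRY_MASK) ==
			RADIX_TREE_INTERNAL_NODE;
}

int main(void)
{
	/*
	 * Slab objects are at least pointer-aligned, so a real
	 * radix_tree_node address always has the bottom two bits clear.
	 */
	unsigned long node = 0x1000;	/* stand-in for a node address */

	void *data = (void *)node;					/* 00 */
	void *internal = (void *)(node | RADIX_TREE_INTERNAL_NODE);	/* 01 */
	void *exceptional = (void *)(0x40UL | RADIX_TREE_EXCEPTIONAL_ENTRY); /* 10 */
	void *locked_exc = (void *)(0x40UL | 3UL);	/* 11, now free for users */

	assert(!is_internal_node(data));
	assert(is_internal_node(internal));
	assert(!is_internal_node(exceptional));
	/* The old one-bit test ((unsigned long)p & 1) would wrongly say true here: */
	assert(!is_internal_node(locked_exc));

	printf("two-bit entry tagging behaves as described\n");
	return 0;
}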