From 91fbdc0f89807bb97792ea6893717a8d3154b871 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Feb 2015 15:25:04 -0800 Subject: mm/page_alloc.c:__alloc_pages_nodemask(): don't alter arg gfp_mask __alloc_pages_nodemask() strips __GFP_IO when retrying the page allocation. But it does this by altering the function-wide variable gfp_mask. This will cause subsequent allocation attempts to inadvertently use the modified gfp_mask. Also, pass the correct mask (the mask we actually used) into trace_mm_page_alloc(). Cc: Ming Lei Cc: Mel Gorman Cc: Johannes Weiner Reviewed-by: Yasuaki Ishimatsu Cc: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f121050e8530..1c7d90f7a84a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2865,6 +2865,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; int classzone_idx; + gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ gfp_mask &= gfp_allowed_mask; @@ -2898,22 +2899,24 @@ retry_cpuset: classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, alloc_flags, - preferred_zone, classzone_idx, migratetype); + alloc_mask = gfp_mask|__GFP_HARDWALL; + page = get_page_from_freelist(alloc_mask, nodemask, order, zonelist, + high_zoneidx, alloc_flags, preferred_zone, + classzone_idx, migratetype); if (unlikely(!page)) { /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. */ - gfp_mask = memalloc_noio_flags(gfp_mask); - page = __alloc_pages_slowpath(gfp_mask, order, + alloc_mask = memalloc_noio_flags(gfp_mask); + + page = __alloc_pages_slowpath(alloc_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, classzone_idx, migratetype); } - trace_mm_page_alloc(page, order, gfp_mask, migratetype); + trace_mm_page_alloc(page, order, alloc_mask, migratetype); out: /* -- cgit v1.2.3 From 23f086f962e67a1b8a508c0d8e86b7833c941564 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Feb 2015 15:25:07 -0800 Subject: kmemcheck: move hook into __alloc_pages_nodemask() for the page allocator Now kmemcheck_pagealloc_alloc() is only called by __alloc_pages_slowpath(). __alloc_pages_nodemask() __alloc_pages_slowpath() kmemcheck_pagealloc_alloc() And the page will not be tracked by kmemcheck in the following path. __alloc_pages_nodemask() get_page_from_freelist() So move kmemcheck_pagealloc_alloc() into __alloc_pages_nodemask(), like this: __alloc_pages_nodemask() ... get_page_from_freelist() if (!page) __alloc_pages_slowpath() kmemcheck_pagealloc_alloc() ... Signed-off-by: Xishi Qiu Cc: Vegard Nossum Cc: Pekka Enberg Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1c7d90f7a84a..a88cb0cbf352 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2842,11 +2842,7 @@ retry: nopage: warn_alloc_failed(gfp_mask, order, NULL); - return page; got_pg: - if (kmemcheck_enabled) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - return page; } @@ -2916,6 +2912,9 @@ retry_cpuset: preferred_zone, classzone_idx, migratetype); } + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + trace_mm_page_alloc(page, order, alloc_mask, migratetype); out: -- cgit v1.2.3 From 753791910e23a95aade78f69e49713acddf8bb8c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:38 -0800 Subject: mm: set page->pfmemalloc in prep_new_page() The possibility of replacing the numerous parameters of alloc_pages* functions with a single structure has been discussed when Minchan proposed to expand the x86 kernel stack [1]. This series implements the change, along with few more cleanups/microoptimizations. The series is based on next-20150108 and I used gcc 4.8.3 20140627 on openSUSE 13.2 for compiling. Config includess NUMA and COMPACTION. The core change is the introduction of a new struct alloc_context, which looks like this: struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zone *preferred_zone; int classzone_idx; int migratetype; enum zone_type high_zoneidx; }; All the contents is mostly constant, except that __alloc_pages_slowpath() changes preferred_zone, classzone_idx and potentially zonelist. But that's not a problem in case control returns to retry_cpuset: in __alloc_pages_nodemask(), those will be reset to initial values again (although it's a bit subtle). On the other hand, gfp_flags and alloc_info mutate so much that it doesn't make sense to put them into alloc_context. Still, the result is one parameter instead of up to 7. This is all in Patch 2. Patch 3 is a step to expand alloc_context usage out of page_alloc.c itself. The function try_to_compact_pages() can also much benefit from the parameter reduction, but it means the struct definition has to be moved to a shared header. Patch 1 should IMHO be included even if the rest is deemed not useful enough. It improves maintainability and also has some code/stack reduction. Patch 4 is OTOH a tiny optimization. Overall bloat-o-meter results: add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-460 (-460) function old new delta nr_free_zone_pages 129 115 -14 __alloc_pages_direct_compact 329 256 -73 get_page_from_freelist 2670 2576 -94 __alloc_pages_nodemask 2564 2285 -279 try_to_compact_pages 582 579 -3 Overall stack sizes per ./scripts/checkstack.pl: old new delta get_page_from_freelist: 184 184 0 __alloc_pages_nodemask 248 200 -48 __alloc_pages_direct_c 40 - -40 try_to_compact_pages 72 72 0 -88 [1] http://marc.info/?l=linux-mm&m=140142462528257&w=2 This patch (of 4): prep_new_page() sets almost everything in the struct page of the page being allocated, except page->pfmemalloc. This is not obvious and has at least once led to a bug where page->pfmemalloc was forgotten to be set correctly, see commit 8fb74b9fb2b1 ("mm: compaction: partially revert capture of suitable high-order page"). This patch moves the pfmemalloc setting to prep_new_page(), which means it needs to gain alloc_flags parameter. The call to prep_new_page is moved from buffered_rmqueue() to get_page_from_freelist(), which also leads to simpler code. An obsolete comment for buffered_rmqueue() is replaced. In addition to better maintainability there is a small reduction of code and stack usage for get_page_from_freelist(), which inlines the other functions involved. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-145 (-145) function old new delta get_page_from_freelist 2670 2525 -145 Stack usage is reduced from 184 to 168 bytes. Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a88cb0cbf352..30a3250c0a21 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -970,7 +970,8 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + int alloc_flags) { int i; @@ -994,6 +995,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) set_page_owner(page, order, gfp_flags); + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * allocate the page. The expectation is that the caller is taking + * steps that will free more memory. The caller should avoid the page + * being used for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return 0; } @@ -1642,9 +1651,7 @@ int split_free_page(struct page *page) } /* - * Really, prep_compound_page() should be called from __rmqueue_bulk(). But - * we cheat by calling it from here, in the order > 0 path. Saves a branch - * or two. + * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, @@ -1655,7 +1662,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct page *page; bool cold = ((gfp_flags & __GFP_COLD) != 0); -again: if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; @@ -1711,8 +1717,6 @@ again: local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); - if (prep_new_page(page, order, gfp_flags)) - goto again; return page; failed: @@ -2177,25 +2181,16 @@ zonelist_scan: try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); - if (page) - break; + if (page) { + if (prep_new_page(page, order, gfp_mask, alloc_flags)) + goto try_this_zone; + return page; + } this_zone_full: if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (page) { - /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was - * necessary to allocate the page. The expectation is - * that the caller is taking steps that will free more - * memory. The caller should avoid the page being used - * for !PFMEMALLOC purposes. - */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); - return page; - } - /* * The first pass makes sure allocations are spread fairly within the * local node. However, the local node might have free pages left -- cgit v1.2.3 From a9263751e11a07af40a98dba88021821cd430cfd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:41 -0800 Subject: mm, page_alloc: reduce number of alloc_pages* functions' parameters Introduce struct alloc_context to accumulate the numerous parameters passed between the alloc_pages* family of functions and get_page_from_freelist(). This excludes gfp_flags and alloc_info, which mutate too much along the way, and allocation order, which is conceptually different. The result is shorter function signatures, as well as overal code size and stack usage reductions. bloat-o-meter: add/remove: 0/0 grow/shrink: 1/2 up/down: 127/-310 (-183) function old new delta get_page_from_freelist 2525 2652 +127 __alloc_pages_direct_compact 329 283 -46 __alloc_pages_nodemask 2564 2300 -264 checkstack.pl: function old new __alloc_pages_nodemask 248 200 get_page_from_freelist 168 184 __alloc_pages_direct_compact 40 24 Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 229 ++++++++++++++++++++++++++------------------------------ 1 file changed, 108 insertions(+), 121 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30a3250c0a21..4aead0bd8d44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,6 +232,27 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +/* + * Structure for holding the mostly immutable allocation parameters passed + * between alloc_pages* family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -2037,10 +2058,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) * a page. */ static struct page * -get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int classzone_idx, int migratetype) +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) { + struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; @@ -2059,8 +2080,8 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { unsigned long mark; if (IS_ENABLED(CONFIG_NUMA) && zlc_active && @@ -2077,7 +2098,7 @@ zonelist_scan: * time the page has in memory before being reclaimed. */ if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(preferred_zone, zone)) + if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; @@ -2115,7 +2136,7 @@ zonelist_scan: mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2136,7 +2157,7 @@ zonelist_scan: } if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zone, zone)) goto this_zone_full; /* @@ -2158,7 +2179,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) + ac->classzone_idx, alloc_flags)) goto try_this_zone; /* @@ -2179,8 +2200,8 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(preferred_zone, zone, order, - gfp_mask, migratetype); + page = buffered_rmqueue(ac->preferred_zone, zone, order, + gfp_mask, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; @@ -2203,7 +2224,7 @@ this_zone_full: alloc_flags &= ~ALLOC_FAIR; if (nr_fair_skipped) { zonelist_rescan = true; - reset_alloc_batches(preferred_zone); + reset_alloc_batches(ac->preferred_zone); } if (nr_online_nodes > 1) zonelist_rescan = true; @@ -2325,9 +2346,7 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype, unsigned long *did_some_progress) + const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page; @@ -2340,7 +2359,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. */ - if (!oom_zonelist_trylock(zonelist, gfp_mask)) { + if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; @@ -2359,10 +2378,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; @@ -2374,7 +2391,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (high_zoneidx < ZONE_NORMAL) + if (ac->high_zoneidx < ZONE_NORMAL) goto out; /* The OOM killer does not compensate for light reclaim */ if (!(gfp_mask & __GFP_FS)) @@ -2390,10 +2407,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, nodemask, false); + out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); *did_some_progress = 1; out: - oom_zonelist_unlock(zonelist, gfp_mask); + oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; } @@ -2401,10 +2418,9 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { unsigned long compact_result; struct page *page; @@ -2413,10 +2429,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, mode, + compact_result = try_to_compact_pages(ac->zonelist, order, gfp_mask, + ac->nodemask, mode, contended_compaction, - alloc_flags, classzone_idx); + alloc_flags, ac->classzone_idx); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2435,10 +2451,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) { struct zone *zone = page_zone(page); @@ -2462,10 +2476,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { return NULL; } @@ -2473,8 +2486,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, /* Perform direct synchronous page reclaim */ static int -__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, - nodemask_t *nodemask) +__perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) { struct reclaim_state reclaim_state; int progress; @@ -2488,7 +2501,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(ac->zonelist, order, gfp_mask, + ac->nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2502,28 +2516,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, unsigned long *did_some_progress) + int alloc_flags, const struct alloc_context *ac, + unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; - *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, - nodemask); + *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) return NULL; /* After successful reclaim, reconsider all zones for allocation */ if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(zonelist); + zlc_clear_zones_full(ac->zonelist); retry: - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, - migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -2544,36 +2553,30 @@ retry: */ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + const struct alloc_context *ac) { struct page *page; do { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, + HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; } -static void wake_all_kswapds(unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone, - nodemask_t *nodemask) +static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->high_zoneidx, ac->nodemask) + wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } static inline int @@ -2632,9 +2635,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + struct alloc_context *ac) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2670,8 +2671,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, - preferred_zone, nodemask); + wake_all_kswapds(order, ac); /* * OK, we're below the kswapd watermark and have kicked background @@ -2684,17 +2684,16 @@ retry: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, - NULL, &preferred_zone); - classzone_idx = zonelist_zone_idx(preferred_zoneref); + preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, NULL, &ac->preferred_zone); + ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); } /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) goto got_pg; @@ -2705,11 +2704,10 @@ retry: * the allocation is high priority and these type of * allocations are system rather than user orientated */ - zonelist = node_zonelist(numa_node_id(), gfp_mask); + ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + + page = __alloc_pages_high_priority(gfp_mask, order, ac); - page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, - preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2738,11 +2736,9 @@ retry: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, - high_zoneidx, nodemask, alloc_flags, - preferred_zone, - classzone_idx, migratetype, - migration_mode, &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + migration_mode, + &contended_compaction, &deferred_compaction); if (page) goto got_pg; @@ -2788,12 +2784,8 @@ retry: migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ - page = __alloc_pages_direct_reclaim(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - classzone_idx, migratetype, - &did_some_progress); + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, + &did_some_progress); if (page) goto got_pg; @@ -2807,17 +2799,15 @@ retry: * start OOM killing tasks. */ if (!did_some_progress) { - page = __alloc_pages_may_oom(gfp_mask, order, zonelist, - high_zoneidx, nodemask, - preferred_zone, classzone_idx, - migratetype,&did_some_progress); + page = __alloc_pages_may_oom(gfp_mask, order, ac, + &did_some_progress); if (page) goto got_pg; if (!did_some_progress) goto nopage; } /* Wait for some write requests to complete then retry */ - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; } else { /* @@ -2825,11 +2815,9 @@ retry: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, - high_zoneidx, nodemask, alloc_flags, - preferred_zone, - classzone_idx, migratetype, - migration_mode, &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, migration_mode, + &contended_compaction, &deferred_compaction); if (page) goto got_pg; @@ -2848,15 +2836,16 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - int classzone_idx; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { + .high_zoneidx = gfp_zone(gfp_mask), + .nodemask = nodemask, + .migratetype = gfpflags_to_migratetype(gfp_mask), + }; gfp_mask &= gfp_allowed_mask; @@ -2875,25 +2864,25 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); + /* We set it here, as __alloc_pages_slowpath might have changed it */ + ac.zonelist = zonelist; /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, - nodemask ? : &cpuset_current_mems_allowed, - &preferred_zone); - if (!preferred_zone) + preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, + ac.nodemask ? : &cpuset_current_mems_allowed, + &ac.preferred_zone); + if (!ac.preferred_zone) goto out; - classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ alloc_mask = gfp_mask|__GFP_HARDWALL; - page = get_page_from_freelist(alloc_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags, preferred_zone, - classzone_idx, migratetype); + page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (unlikely(!page)) { /* * Runtime PM, block IO and its error handling path @@ -2902,15 +2891,13 @@ retry_cpuset: */ alloc_mask = memalloc_noio_flags(gfp_mask); - page = __alloc_pages_slowpath(alloc_mask, order, - zonelist, high_zoneidx, nodemask, - preferred_zone, classzone_idx, migratetype); + page = __alloc_pages_slowpath(alloc_mask, order, &ac); } if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, migratetype); + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); out: /* -- cgit v1.2.3 From 1a6d53a105406d97396c87511afd6f09b4dc8ad2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:44 -0800 Subject: mm: reduce try_to_compact_pages parameters Expand the usage of the struct alloc_context introduced in the previous patch also for calling try_to_compact_pages(), to reduce the number of its parameters. Since the function is in different compilation unit, we need to move alloc_context definition in the shared mm/internal.h header. With this change we get simpler code and small savings of code size and stack usage: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta __alloc_pages_direct_compact 283 256 -27 add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-13 (-13) function old new delta try_to_compact_pages 582 569 -13 Stack usage of __alloc_pages_direct_compact goes from 24 to none (per scripts/checkstack.pl). Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 17 +++++++++-------- mm/compaction.c | 23 +++++++++++------------ mm/internal.h | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 ++------------------------- 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3238ffa33f68..f2efda2e6ac6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,6 +21,8 @@ /* Zone lock or lru_lock was contended in async compaction */ #define COMPACT_CONTENDED_LOCK 2 +struct alloc_context; /* in mm/internal.h */ + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -30,10 +32,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx); +extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, @@ -101,10 +102,10 @@ static inline bool compaction_restarting(struct zone *zone, int order) } #else -static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, int alloc_flags, + const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..9c7e6909dd29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1335,22 +1335,20 @@ int sysctl_extfrag_threshold = 500; /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation - * @zonelist: The zonelist used for the current allocation - * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation - * @nodemask: The allowed nodes to allocate from + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention * * This is the main entry point for direct page compaction. */ -unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1365,8 +1363,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_SKIPPED; /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { int status; int zone_contended; @@ -1374,7 +1372,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, classzone_idx); + &zone_contended, alloc_flags, + ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1384,7 +1383,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,6 +109,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * in mm/page_alloc.c */ +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aead0bd8d44..d664eb922a7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,27 +232,6 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif -/* - * Structure for holding the mostly immutable allocation parameters passed - * between alloc_pages* family of functions. - * - * nodemask, migratetype and high_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. - * - * zonelist, preferred_zone and classzone_idx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure - * by a const pointer. - */ -struct alloc_context { - struct zonelist *zonelist; - nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; - int migratetype; - enum zone_type high_zoneidx; -}; - int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -2429,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(ac->zonelist, order, gfp_mask, - ac->nodemask, mode, - contended_compaction, - alloc_flags, ac->classzone_idx); + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { -- cgit v1.2.3 From 6e9f0d582dde095d971a3c6ce4685a218a0eac8e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:25:50 -0800 Subject: mm/page_alloc.c: drop dead destroy_compound_page() The only caller is __free_one_page(). By the time we should have page->flags to be cleared already: - for 0-order pages though PCP list: free_hot_cold_page() free_pages_prepare() free_pages_check() page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; free_pcppages_bulk() page = __free_one_page(page) - for non-0-order pages: __free_pages_ok() free_pages_prepare() free_pages_check() page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; free_one_page() __free_one_page() So there's no way PageCompound() will return true in __free_one_page(). Let's remove dead destroy_compound_page() and put assert for page->flags there instead. Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d664eb922a7d..12d55b859d3f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -381,36 +381,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -/* update __split_huge_page_refcount if you change this function */ -static int destroy_compound_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - int bad = 0; - - if (unlikely(compound_order(page) != order)) { - bad_page(page, "wrong compound order", 0); - bad++; - } - - __ClearPageHead(page); - - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - - if (unlikely(!PageTail(p))) { - bad_page(page, "PageTail not set", 0); - bad++; - } else if (unlikely(p->first_page != page)) { - bad_page(page, "first_page not consistent", 0); - bad++; - } - __ClearPageTail(p); - } - - return bad; -} - static inline void prep_zero_page(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -613,10 +583,7 @@ static inline void __free_one_page(struct page *page, int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); - - if (unlikely(PageCompound(page))) - if (unlikely(destroy_compound_page(page, order))) - return; + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); if (is_migrate_isolate(migratetype)) { -- cgit v1.2.3 From 81422f29c5f4fb968023f465218c3d978c133ceb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:25:52 -0800 Subject: mm: more checks on free_pages_prepare() for tail pages Although it was not called, destroy_compound_page() did some potentially useful checks. Let's re-introduce them in free_pages_prepare(), where they can be actually triggered when CONFIG_DEBUG_VM=y. compound_order() assert is already in free_pages_prepare(). We have few checks for tail pages left. Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12d55b859d3f..0081228d1caa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -764,21 +764,40 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return 0; + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + return 1; + } + if (unlikely(page->first_page != head_page)) { + bad_page(page, "first_page not consistent", 0); + return 1; + } + return 0; +} + static bool free_pages_prepare(struct page *page, unsigned int order) { - int i; - int bad = 0; + bool compound = PageCompound(page); + int i, bad = 0; VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) page->mapping = NULL; - for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page); + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); bad += free_pages_check(page + i); + } if (bad) return false; -- cgit v1.2.3 From 8d29e18a459dfc2adeafc1acb9c4185ee6713116 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 11 Feb 2015 15:26:01 -0800 Subject: mm: use correct format specifiers when printing address ranges Especially on 32 bit kernels memory node ranges are printed with 32 bit wide addresses only. Use u64 types and %llx specifiers to print full width of addresses. Signed-off-by: Juergen Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0081228d1caa..641d5a9a8617 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4994,8 +4994,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, - (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5367,9 +5367,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_highest_possible_pfn[i]) pr_cont("empty\n"); else - pr_cont("[mem %0#10lx-%0#10lx]\n", - arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, - (arch_zone_highest_possible_pfn[i] + pr_cont("[mem %#018Lx-%#018Lx]\n", + (u64)arch_zone_lowest_possible_pfn[i] + << PAGE_SHIFT, + ((u64)arch_zone_highest_possible_pfn[i] << PAGE_SHIFT) - 1); } @@ -5377,15 +5378,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - pr_info(" Node %d: %#010lx\n", i, - zone_movable_pfn[i] << PAGE_SHIFT); + pr_info(" Node %d: %#018Lx\n", i, + (u64)zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early node map */ pr_info("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, - start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); + pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + ((u64)end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); -- cgit v1.2.3 From c32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:24 -0800 Subject: oom, PM: make OOM detection in the freezer path raceless Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely and partial solution deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires the full OOM and freezer's task freezing exclusion, though. This is done by this patch which introduces oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disable all the further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reacheable from page fault." so it would be better and more robust to not rely on freezing points here. Same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation same as before. The page fault path will retry the fault (more on that later) and Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it. This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko Suggested-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 5 +- include/linux/oom.h | 14 ++---- kernel/exit.c | 3 +- kernel/power/process.c | 50 ++++--------------- mm/memcontrol.c | 2 +- mm/oom_kill.c | 132 +++++++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 17 +------ 7 files changed, 132 insertions(+), 91 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe4d258ef32b..fbf64e6f64e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given taks as OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already. */ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the lasst oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() + * when it returns false. Otherwise returns true. + */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; -- cgit v1.2.3 From 99592d598eca62bdbbf62b59941c189176dfc614 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:28:15 -0800 Subject: mm: when stealing freepages, also take pages created by splitting buddy page When studying page stealing, I noticed some weird looking decisions in try_to_steal_freepages(). The first I assume is a bug (Patch 1), the following two patches were driven by evaluation. Testing was done with stress-highalloc of mmtests, using the mm_page_alloc_extfrag tracepoint and postprocessing to get counts of how often page stealing occurs for individual migratetypes, and what migratetypes are used for fallbacks. Arguably, the worst case of page stealing is when UNMOVABLE allocation steals from MOVABLE pageblock. RECLAIMABLE allocation stealing from MOVABLE allocation is also not ideal, so the goal is to minimize these two cases. The evaluation of v2 wasn't always clear win and Joonsoo questioned the results. Here I used different baseline which includes RFC compaction improvements from [1]. I found that the compaction improvements reduce variability of stress-highalloc, so there's less noise in the data. First, let's look at stress-highalloc configured to do sync compaction, and how these patches reduce page stealing events during the test. First column is after fresh reboot, other two are reiterations of test without reboot. That was all accumulater over 5 re-iterations (so the benchmark was run 5x3 times with 5 fresh restarts). Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-nothp-1 5-nothp-2 5-nothp-3 Page alloc extfrag event 10264225 8702233 10244125 Extfrag fragmenting 10263271 8701552 10243473 Extfrag fragmenting for unmovable 13595 17616 15960 Extfrag fragmenting unmovable placed with movable 7989 12193 8447 Extfrag fragmenting for reclaimable 658 1840 1817 Extfrag fragmenting reclaimable placed with movable 558 1677 1679 Extfrag fragmenting for movable 10249018 8682096 10225696 With Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-nothp-1 6-nothp-2 6-nothp-3 Page alloc extfrag event 11834954 9877523 9774860 Extfrag fragmenting 11833993 9876880 9774245 Extfrag fragmenting for unmovable 7342 16129 11712 Extfrag fragmenting unmovable placed with movable 4191 10547 6270 Extfrag fragmenting for reclaimable 373 1130 923 Extfrag fragmenting reclaimable placed with movable 302 906 738 Extfrag fragmenting for movable 11826278 9859621 9761610 With Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-nothp-1 7-nothp-2 7-nothp-3 Page alloc extfrag event 4725990 3668793 3807436 Extfrag fragmenting 4725104 3668252 3806898 Extfrag fragmenting for unmovable 6678 7974 7281 Extfrag fragmenting unmovable placed with movable 2051 3829 4017 Extfrag fragmenting for reclaimable 429 1208 1278 Extfrag fragmenting reclaimable placed with movable 369 976 1034 Extfrag fragmenting for movable 4717997 3659070 3798339 With Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-nothp-1 8-nothp-2 8-nothp-3 Page alloc extfrag event 5016183 4700142 3850633 Extfrag fragmenting 5015325 4699613 3850072 Extfrag fragmenting for unmovable 1312 3154 3088 Extfrag fragmenting unmovable placed with movable 1115 2777 2714 Extfrag fragmenting for reclaimable 437 1193 1097 Extfrag fragmenting reclaimable placed with movable 330 969 879 Extfrag fragmenting for movable 5013576 4695266 3845887 In v2 we've seen apparent regression with Patch 1 for unmovable events, this is now gone, suggesting it was indeed noise. Here, each patch improves the situation for unmovable events. Reclaimable is improved by patch 1 and then either the same modulo noise, or perhaps sligtly worse - a small price for unmovable improvements, IMHO. The number of movable allocations falling back to other migratetypes is most noisy, but it's reduced to half at Patch 2 nevertheless. These are least critical as compaction can move them around. If we look at success rates, the patches don't affect them, that didn't change. Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-nothp-1 5-nothp-2 5-nothp-3 Success 1 Min 49.00 ( 0.00%) 42.00 ( 14.29%) 41.00 ( 16.33%) Success 1 Mean 51.00 ( 0.00%) 45.00 ( 11.76%) 42.60 ( 16.47%) Success 1 Max 55.00 ( 0.00%) 51.00 ( 7.27%) 46.00 ( 16.36%) Success 2 Min 53.00 ( 0.00%) 47.00 ( 11.32%) 44.00 ( 16.98%) Success 2 Mean 59.60 ( 0.00%) 50.80 ( 14.77%) 48.20 ( 19.13%) Success 2 Max 64.00 ( 0.00%) 56.00 ( 12.50%) 52.00 ( 18.75%) Success 3 Min 84.00 ( 0.00%) 82.00 ( 2.38%) 78.00 ( 7.14%) Success 3 Mean 85.60 ( 0.00%) 82.80 ( 3.27%) 79.40 ( 7.24%) Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%) Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-nothp-1 6-nothp-2 6-nothp-3 Success 1 Min 49.00 ( 0.00%) 44.00 ( 10.20%) 44.00 ( 10.20%) Success 1 Mean 51.80 ( 0.00%) 46.00 ( 11.20%) 45.80 ( 11.58%) Success 1 Max 54.00 ( 0.00%) 49.00 ( 9.26%) 49.00 ( 9.26%) Success 2 Min 58.00 ( 0.00%) 49.00 ( 15.52%) 48.00 ( 17.24%) Success 2 Mean 60.40 ( 0.00%) 51.80 ( 14.24%) 50.80 ( 15.89%) Success 2 Max 63.00 ( 0.00%) 54.00 ( 14.29%) 55.00 ( 12.70%) Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%) Success 3 Mean 85.00 ( 0.00%) 81.60 ( 4.00%) 79.80 ( 6.12%) Success 3 Max 86.00 ( 0.00%) 82.00 ( 4.65%) 82.00 ( 4.65%) Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-nothp-1 7-nothp-2 7-nothp-3 Success 1 Min 50.00 ( 0.00%) 44.00 ( 12.00%) 39.00 ( 22.00%) Success 1 Mean 52.80 ( 0.00%) 45.60 ( 13.64%) 42.40 ( 19.70%) Success 1 Max 55.00 ( 0.00%) 46.00 ( 16.36%) 47.00 ( 14.55%) Success 2 Min 52.00 ( 0.00%) 48.00 ( 7.69%) 45.00 ( 13.46%) Success 2 Mean 53.40 ( 0.00%) 49.80 ( 6.74%) 48.80 ( 8.61%) Success 2 Max 57.00 ( 0.00%) 52.00 ( 8.77%) 52.00 ( 8.77%) Success 3 Min 84.00 ( 0.00%) 81.00 ( 3.57%) 79.00 ( 5.95%) Success 3 Mean 85.00 ( 0.00%) 82.40 ( 3.06%) 79.60 ( 6.35%) Success 3 Max 86.00 ( 0.00%) 83.00 ( 3.49%) 80.00 ( 6.98%) Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-nothp-1 8-nothp-2 8-nothp-3 Success 1 Min 46.00 ( 0.00%) 44.00 ( 4.35%) 42.00 ( 8.70%) Success 1 Mean 50.20 ( 0.00%) 45.60 ( 9.16%) 44.00 ( 12.35%) Success 1 Max 52.00 ( 0.00%) 47.00 ( 9.62%) 47.00 ( 9.62%) Success 2 Min 53.00 ( 0.00%) 49.00 ( 7.55%) 48.00 ( 9.43%) Success 2 Mean 55.80 ( 0.00%) 50.60 ( 9.32%) 49.00 ( 12.19%) Success 2 Max 59.00 ( 0.00%) 52.00 ( 11.86%) 51.00 ( 13.56%) Success 3 Min 84.00 ( 0.00%) 80.00 ( 4.76%) 79.00 ( 5.95%) Success 3 Mean 85.40 ( 0.00%) 81.60 ( 4.45%) 80.40 ( 5.85%) Success 3 Max 87.00 ( 0.00%) 83.00 ( 4.60%) 82.00 ( 5.75%) While there's no improvement here, I consider reduced fragmentation events to be worth on its own. Patch 2 also seems to reduce scanning for free pages, and migrations in compaction, suggesting it has somewhat less work to do: Patch 1: Compaction stalls 4153 3959 3978 Compaction success 1523 1441 1446 Compaction failures 2630 2517 2531 Page migrate success 4600827 4943120 5104348 Page migrate failure 19763 16656 17806 Compaction pages isolated 9597640 10305617 10653541 Compaction migrate scanned 77828948 86533283 87137064 Compaction free scanned 517758295 521312840 521462251 Compaction cost 5503 5932 6110 Patch 2: Compaction stalls 3800 3450 3518 Compaction success 1421 1316 1317 Compaction failures 2379 2134 2201 Page migrate success 4160421 4502708 4752148 Page migrate failure 19705 14340 14911 Compaction pages isolated 8731983 9382374 9910043 Compaction migrate scanned 98362797 96349194 98609686 Compaction free scanned 496512560 469502017 480442545 Compaction cost 5173 5526 5811 As with v2, /proc/pagetypeinfo appears unaffected with respect to numbers of unmovable and reclaimable pageblocks. Configuring the benchmark to allocate like THP page fault (i.e. no sync compaction) gives much noisier results for iterations 2 and 3 after reboot. This is not so surprising given how [1] offers lower improvements in this scenario due to less restarts after deferred compaction which would change compaction pivot. Baseline: 3.19-rc4 3.19-rc4 3.19-rc4 5-thp-1 5-thp-2 5-thp-3 Page alloc extfrag event 8148965 6227815 6646741 Extfrag fragmenting 8147872 6227130 6646117 Extfrag fragmenting for unmovable 10324 12942 15975 Extfrag fragmenting unmovable placed with movable 5972 8495 10907 Extfrag fragmenting for reclaimable 601 1707 2210 Extfrag fragmenting reclaimable placed with movable 520 1570 2000 Extfrag fragmenting for movable 8136947 6212481 6627932 Patch 1: 3.19-rc4 3.19-rc4 3.19-rc4 6-thp-1 6-thp-2 6-thp-3 Page alloc extfrag event 8345457 7574471 7020419 Extfrag fragmenting 8343546 7573777 7019718 Extfrag fragmenting for unmovable 10256 18535 30716 Extfrag fragmenting unmovable placed with movable 6893 11726 22181 Extfrag fragmenting for reclaimable 465 1208 1023 Extfrag fragmenting reclaimable placed with movable 353 996 843 Extfrag fragmenting for movable 8332825 7554034 6987979 Patch 2: 3.19-rc4 3.19-rc4 3.19-rc4 7-thp-1 7-thp-2 7-thp-3 Page alloc extfrag event 3512847 3020756 2891625 Extfrag fragmenting 3511940 3020185 2891059 Extfrag fragmenting for unmovable 9017 6892 6191 Extfrag fragmenting unmovable placed with movable 1524 3053 2435 Extfrag fragmenting for reclaimable 445 1081 1160 Extfrag fragmenting reclaimable placed with movable 375 918 986 Extfrag fragmenting for movable 3502478 3012212 2883708 Patch 3: 3.19-rc4 3.19-rc4 3.19-rc4 8-thp-1 8-thp-2 8-thp-3 Page alloc extfrag event 3181699 3082881 2674164 Extfrag fragmenting 3180812 3082303 2673611 Extfrag fragmenting for unmovable 1201 4031 4040 Extfrag fragmenting unmovable placed with movable 974 3611 3645 Extfrag fragmenting for reclaimable 478 1165 1294 Extfrag fragmenting reclaimable placed with movable 387 985 1030 Extfrag fragmenting for movable 3179133 3077107 2668277 The improvements for first iteration are clear, the rest is much noisier and can appear like regression for Patch 1. Anyway, patch 2 rectifies it. Allocation success rates are again unaffected so there's no point in making this e-mail any longer. [1] http://marc.info/?l=linux-mm&m=142166196321125&w=2 This patch (of 3): When __rmqueue_fallback() is called to allocate a page of order X, it will find a page of order Y >= X of a fallback migratetype, which is different from the desired migratetype. With the help of try_to_steal_freepages(), it may change the migratetype (to the desired one) also of: 1) all currently free pages in the pageblock containing the fallback page 2) the fallback pageblock itself 3) buddy pages created by splitting the fallback page (when Y > X) These decisions take the order Y into account, as well as the desired migratetype, with the goal of preventing multiple fallback allocations that could e.g. distribute UNMOVABLE allocations among multiple pageblocks. Originally, decision for 1) has implied the decision for 3). Commit 47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") changed that (probably unintentionally) so that the buddy pages in case 3) are always changed to the desired migratetype, except for CMA pageblocks. Commit fef903efcf0c ("mm/page_allo.c: restructure free-page stealing code and fix a bug") did some refactoring and added a comment that the case of 3) is intended. Commit 0cbef29a7821 ("mm: __rmqueue_fallback() should respect pageblock type") removed the comment and tried to restore the original behavior where 1) implies 3), but due to the previous refactoring, the result is instead that only 2) implies 3) - and the conditions for 2) are less frequently met than conditions for 1). This may increase fragmentation in situations where the code decides to steal all free pages from the pageblock (case 1)), but then gives back the buddy pages produced by splitting. This patch restores the original intended logic where 1) implies 3). During testing with stress-highalloc from mmtests, this has shown to decrease the number of events where UNMOVABLE and RECLAIMABLE allocations steal from MOVABLE pageblocks, which can lead to permanent fragmentation. In some cases it has increased the number of events when MOVABLE allocations steal from UNMOVABLE or RECLAIMABLE pageblocks, but these are fixable by sync compaction and thus less harmful. Note that evaluation has shown that the behavior introduced by 47118af076f6 for buddy pages in case 3) is actually even better than the original logic, so the following patch will introduce it properly once again. For stable backports of this patch it makes thus sense to only fix versions containing 0cbef29a7821. [iamjoonsoo.kim@lge.com: tracepoint fix] Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Zhang Yanfei Acked-by: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: KOSAKI Motohiro Cc: [3.13+ containing 0cbef29a7821] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 7 ++++--- mm/page_alloc.c | 12 +++++------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index aece1346ceb7..4ad10baecd4d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -268,11 +268,11 @@ TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype, int new_migratetype), + int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype, new_migratetype), + alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( struct page *, page ) @@ -289,7 +289,8 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; - __entry->change_ownership = (new_migratetype == alloc_migratetype); + __entry->change_ownership = (alloc_migratetype == + get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 134e25525044..b7a881019352 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1131,8 +1131,8 @@ static void change_pageblock_range(struct page *pageblock_page, * nor move CMA pages to different free lists. We don't want unmovable pages * to be allocated from MIGRATE_CMA areas. * - * Returns the new migratetype of the pageblock (or the same old migratetype - * if it was unchanged). + * Returns the allocation migratetype if free pages were stolen, or the + * fallback migratetype if it was decided not to steal. */ static int try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) @@ -1163,12 +1163,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) { - + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); - return start_type; - } + return start_type; } return fallback_type; @@ -1220,7 +1218,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype, new_type); + start_migratetype, migratetype); return page; } -- cgit v1.2.3 From 3a1086fba92b6e2311b6a342f68bc380beb240fe Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:28:18 -0800 Subject: mm: always steal split buddies in fallback allocations When allocation falls back to another migratetype, it will steal a page with highest available order, and (depending on this order and desired migratetype), it might also steal the rest of free pages from the same pageblock. Given the preference of highest available order, it is likely that it will be higher than the desired order, and result in the stolen buddy page being split. The remaining pages after split are currently stolen only when the rest of the free pages are stolen. This can however lead to situations where for MOVABLE allocations we split e.g. order-4 fallback UNMOVABLE page, but steal only order-0 page. Then on the next MOVABLE allocation (which may be batched to fill the pcplists) we split another order-3 or higher page, etc. By stealing all pages that we have split, we can avoid further stealing. This patch therefore adjusts the page stealing so that buddy pages created by split are always stolen. This has effect only on MOVABLE allocations, as RECLAIMABLE and UNMOVABLE allocations already always do that in addition to stealing the rest of free pages from the pageblock. The change also allows to simplify try_to_steal_freepages() and factor out CMA handling. According to Mel, it has been intended since the beginning that buddy pages after split would be stolen always, but it doesn't seem like it was ever the case until commit 47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added"). The commit has unintentionally introduced this behavior, but was reverted by commit 0cbef29a7821 ("mm: __rmqueue_fallback() should respect pageblock type"). Neither included evaluation. My evaluation with stress-highalloc from mmtests shows about 2.5x reduction of page stealing events for MOVABLE allocations, without affecting the page stealing events for other allocation migratetypes. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Zhang Yanfei Acked-by: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 62 +++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7a881019352..7c44b49cec40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1125,33 +1125,18 @@ static void change_pageblock_range(struct page *pageblock_page, /* * If breaking a large block of pages, move all free pages to the preferred * allocation list. If falling back for a reclaimable kernel allocation, be - * more aggressive about taking ownership of free pages. - * - * On the other hand, never change migration type of MIGRATE_CMA pageblocks - * nor move CMA pages to different free lists. We don't want unmovable pages - * to be allocated from MIGRATE_CMA areas. - * - * Returns the allocation migratetype if free pages were stolen, or the - * fallback migratetype if it was decided not to steal. + * more aggressive about taking ownership of free pages. If we claim more than + * half of the pageblock, change pageblock's migratetype as well. */ -static int try_to_steal_freepages(struct zone *zone, struct page *page, +static void try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) { int current_order = page_order(page); - /* - * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. We also ensure the freepage_migratetype - * is set to CMA so it is returned to the correct freelist in case - * the page ends up being not actually allocated from the pcp lists. - */ - if (is_migrate_cma(fallback_type)) - return fallback_type; - /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return start_type; + return; } if (current_order >= pageblock_order / 2 || @@ -1165,11 +1150,7 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, if (pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); - - return start_type; } - - return fallback_type; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1179,14 +1160,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; - int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { + int i; for (i = 0;; i++) { - migratetype = fallbacks[start_migratetype][i]; + int migratetype = fallbacks[start_migratetype][i]; + int buddy_type = start_migratetype; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) @@ -1200,22 +1182,36 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct page, lru); area->nr_free--; - new_type = try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); + if (!is_migrate_cma(migratetype)) { + try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); + } else { + /* + * When borrowing from MIGRATE_CMA, we need to + * release the excess buddy pages to CMA + * itself, and we do not try to steal extra + * free pages. + */ + buddy_type = migratetype; + } /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); expand(zone, page, order, current_order, area, - new_type); - /* The freepage_migratetype may differ from pageblock's + buddy_type); + + /* + * The freepage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages. This is OK as long as it does - * not differ for MIGRATE_CMA type. + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. */ - set_freepage_migratetype(page, new_type); + set_freepage_migratetype(page, buddy_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); -- cgit v1.2.3 From 9c0415eb8cbf0c8fd043b6c0f0354308ab099df5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:28:21 -0800 Subject: mm: more aggressive page stealing for UNMOVABLE allocations When allocation falls back to stealing free pages of another migratetype, it can decide to steal extra pages, or even the whole pageblock in order to reduce fragmentation, which could happen if further allocation fallbacks pick a different pageblock. In try_to_steal_freepages(), one of the situations where extra pages are stolen happens when we are trying to allocate a MIGRATE_RECLAIMABLE page. However, MIGRATE_UNMOVABLE allocations are not treated the same way, although spreading such allocation over multiple fallback pageblocks is arguably even worse than it is for RECLAIMABLE allocations. To minimize fragmentation, we should minimize the number of such fallbacks, and thus steal as much as is possible from each fallback pageblock. Note that in theory this might put more pressure on movable pageblocks and cause movable allocations to steal back from unmovable pageblocks. However, movable allocations are not as aggressive with stealing, and do not cause permanent fragmentation, so the tradeoff is reasonable, and evaluation seems to support the change. This patch thus adds a check for MIGRATE_UNMOVABLE to the decision to steal extra free pages. When evaluating with stress-highalloc from mmtests, this has reduced the number of MIGRATE_UNMOVABLE fallbacks to roughly 1/6. The number of these fallbacks stealing from MIGRATE_MOVABLE block is reduced to 1/3. There was no observation of growing number of unmovable pageblocks over time, and also not of increased movable allocation fallbacks. Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7c44b49cec40..8d52ab18fe0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1123,10 +1123,19 @@ static void change_pageblock_range(struct page *pageblock_page, } /* - * If breaking a large block of pages, move all free pages to the preferred - * allocation list. If falling back for a reclaimable kernel allocation, be - * more aggressive about taking ownership of free pages. If we claim more than - * half of the pageblock, change pageblock's migratetype as well. + * When we are falling back to another migratetype during allocation, try to + * steal extra free pages from the same pageblocks to satisfy further + * allocations, instead of polluting multiple pageblocks. + * + * If we are stealing a relatively large buddy page, it is likely there will + * be more free pages in the pageblock, so try to steal them all. For + * reclaimable and unmovable allocations, we steal regardless of page size, + * as fragmentation caused by those allocations polluting movable pageblocks + * is worse than movable allocations stealing from unmovable and reclaimable + * pageblocks. + * + * If we claim more than half of the pageblock, change pageblock's migratetype + * as well. */ static void try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) @@ -1141,6 +1150,7 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, if (current_order >= pageblock_order / 2 || start_type == MIGRATE_RECLAIMABLE || + start_type == MIGRATE_UNMOVABLE || page_group_by_mobility_disabled) { int pages; -- cgit v1.2.3