diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 386 |
1 files changed, 270 insertions, 116 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea018263210..0b97b8ece4a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -265,17 +265,19 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; -static unsigned long __meminitdata nr_kernel_pages; -static unsigned long __meminitdata nr_all_pages; -static unsigned long __meminitdata dma_reserve; +static unsigned long nr_kernel_pages __meminitdata; +static unsigned long nr_all_pages __meminitdata; +static unsigned long dma_reserve __meminitdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; -static unsigned long __initdata required_movablecore; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static bool mirrored_kernelcore; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long required_kernelcore __initdata; +static unsigned long required_kernelcore_percent __initdata; +static unsigned long required_movablecore __initdata; +static unsigned long required_movablecore_percent __initdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - -/* - * Determine how many pages need to be initialized during early boot - * (non-deferred initialization). - * The value of first_deferred_pfn will be set later, once non-deferred pages - * are initialized, but for now set it ULONG_MAX. - */ -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ - phys_addr_t start_addr, end_addr; - unsigned long max_pgcnt; - unsigned long reserved; - - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_pgcnt = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - - /* - * Compensate the all the memblock reservations (e.g. crash kernel) - * from the initial estimation to make sure we will initialize enough - * memory to boot. - */ - start_addr = PFN_PHYS(pgdat->node_start_pfn); - end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); - reserved = memblock_reserved_memory_within(start_addr, end_addr); - max_pgcnt += PHYS_PFN(reserved); - - pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -} - /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -361,10 +329,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, return true; } #else -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ -} - static inline bool early_page_uninitialised(unsigned long pfn) { return false; @@ -1099,6 +1063,15 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ +static inline void prefetch_buddy(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + struct page *buddy = page + (buddy_pfn - pfn); + + prefetch(buddy); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1115,13 +1088,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int prefetch_nr = 0; bool isolated_pageblocks; - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); + struct page *page, *tmp; + LIST_HEAD(head); while (count) { - struct page *page; struct list_head *list; /* @@ -1143,26 +1115,48 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = count; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - - mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); + pcp->count--; if (bulkfree_pcp_prepare(page)) continue; - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add_tail(&page->lru, &head); + + /* + * We are going to put the page back to the global + * pool, prefetch its buddy to speed up later access + * under zone->lock. It is believed the overhead of + * an additional test and calculating buddy_pfn here + * can be offset by reduced memory latency later. To + * avoid excessive prefetching due to large count, only + * prefetch buddy for the first pcp->batch nr of pages. + */ + if (prefetch_nr++ < pcp->batch) + prefetch_buddy(page); } while (--count && --batch_free && !list_empty(list)); } + + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. + */ + list_for_each_entry_safe(page, tmp, &head, lru) { + int mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + } spin_unlock(&zone->lock); } @@ -1181,10 +1175,9 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid, bool zero) + unsigned long zone, int nid) { - if (zero) - mm_zero_struct_page(page); + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1198,12 +1191,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } -static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid, bool zero) -{ - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); -} - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -1222,7 +1209,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid, true); + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1506,7 +1493,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - cond_resched(); + touch_nmi_watchdog(); } else { nr_free++; } @@ -1535,11 +1522,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - cond_resched(); + touch_nmi_watchdog(); } else { page++; } - __init_single_page(page, pfn, zid, nid, true); + __init_single_page(page, pfn, zid, nid); nr_pages++; } return (nr_pages); @@ -1552,23 +1539,25 @@ static int __init deferred_init_memmap(void *data) int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long spfn, epfn; + unsigned long spfn, epfn, first_init_pfn, flags; phys_addr_t spa, epa; int zid; struct zone *zone; - unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); u64 i; + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + pgdat_resize_lock(pgdat, &flags); + first_init_pfn = pgdat->first_deferred_pfn; if (first_init_pfn == ULONG_MAX) { + pgdat_resize_unlock(pgdat, &flags); pgdat_init_report_one_done(); return 0; } - /* Bind memory initialisation thread to a local node if possible */ - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(current, cpumask); - /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); @@ -1598,6 +1587,7 @@ static int __init deferred_init_memmap(void *data) epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); deferred_free_pages(nid, zid, spfn, epfn); } + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1608,6 +1598,117 @@ static int __init deferred_init_memmap(void *data) pgdat_init_report_one_done(); return 0; } + +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * If this zone has deferred pages, try to grow it by initializing enough + * deferred pages to satisfy the allocation specified by order, rounded up to + * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments + * of SECTION_SIZE bytes by initializing struct pages in increments of + * PAGES_PER_SECTION * sizeof(struct page) bytes. + * + * Return true when zone was grown, otherwise return false. We return true even + * when we grow less than requested, to let the caller decide if there are + * enough pages to satisfy the allocation. + * + * Note: We use noinline because this function is needed only during boot, and + * it is called from a __ref function _deferred_grow_zone. This way we are + * making sure that it is not inlined into permanent text section. + */ +static noinline bool __init +deferred_grow_zone(struct zone *zone, unsigned int order) +{ + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages = 0; + unsigned long first_init_pfn, spfn, epfn, t, flags; + unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + phys_addr_t spa, epa; + u64 i; + + /* Only the last zone may have deferred pages */ + if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) + return false; + + pgdat_resize_lock(pgdat, &flags); + + /* + * If deferred pages have been initialized while we were waiting for + * the lock, return true, as the zone was grown. The caller will retry + * this zone. We won't return to this function since the caller also + * has this static branch. + */ + if (!static_branch_unlikely(&deferred_pages)) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + /* + * If someone grew this zone while we were waiting for spinlock, return + * true, as there might be enough pages already. + */ + if (first_deferred_pfn != pgdat->first_deferred_pfn) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); + + if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + pgdat_resize_unlock(pgdat, &flags); + return false; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + + while (spfn < epfn && nr_pages < nr_pages_needed) { + t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); + first_deferred_pfn = min(t, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, + first_deferred_pfn); + spfn = first_deferred_pfn; + } + + if (nr_pages >= nr_pages_needed) + break; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); + + if (first_deferred_pfn == epfn) + break; + } + pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat_resize_unlock(pgdat, &flags); + + return nr_pages > 0; +} + +/* + * deferred_grow_zone() is __init, but it is called from + * get_page_from_freelist() during early boot until deferred_pages permanently + * disables this call. This is why we have refdata wrapper to avoid warning, + * and to ensure that the function body gets unloaded. + */ +static bool __ref +_deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return deferred_grow_zone(zone, order); +} + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) @@ -1626,6 +1727,12 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ + static_branch_disable(&deferred_pages); + /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif @@ -2418,10 +2525,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) local_irq_save(flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) { + if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; - } local_irq_restore(flags); } #endif @@ -2443,10 +2548,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { + if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } local_irq_restore(flags); } @@ -2670,7 +2773,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); - pcp->count -= batch; } } @@ -3205,6 +3307,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, ac_classzone_idx(ac), alloc_flags)) { int ret; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. + */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) @@ -3246,6 +3358,14 @@ try_this_zone: reserve_highatomic_pageblock(page, zone, order); return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif } } @@ -3685,16 +3805,18 @@ retry: return page; } -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; + enum zone_type high_zoneidx = ac->high_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, order, ac->high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -3973,7 +4095,7 @@ retry_cpuset: goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try @@ -4031,7 +4153,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) @@ -5334,6 +5456,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; + struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; #endif @@ -5386,6 +5509,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #endif not_early: + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMMAP_HOTPLUG) + SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -5402,15 +5530,8 @@ not_early: * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone, nid, - context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); - } else { - __init_single_pfn(pfn, zone, nid, - context != MEMMAP_HOTPLUG); } } } @@ -6241,7 +6362,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, alloc_node_mem_map(pgdat); - reset_deferred_meminit(pgdat); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +#endif free_area_init_core(pgdat); } @@ -6471,7 +6600,18 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* - * If movablecore=nn[KMG] was specified, calculate what size of + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. + */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -6711,18 +6851,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zero_resv_unavail(); } -static int __init cmdline_parse_core(char *p, unsigned long *core) +static int __init cmdline_parse_core(char *p, unsigned long *core, + unsigned long *percent) { unsigned long long coremem; + char *endptr; + if (!p) return -EINVAL; - coremem = memparse(p, &p); - *core = coremem >> PAGE_SHIFT; + /* Value may be a percentage of total memory, otherwise bytes */ + coremem = simple_strtoull(p, &endptr, 0); + if (*endptr == '%') { + /* Paranoid check for percent values greater than 100 */ + WARN_ON(coremem > 100); - /* Paranoid check that UL is enough for the coremem value */ - WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *percent = coremem; + } else { + coremem = memparse(p, &p); + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *core = coremem >> PAGE_SHIFT; + *percent = 0UL; + } return 0; } @@ -6738,7 +6890,8 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore); + return cmdline_parse_core(p, &required_kernelcore, + &required_kernelcore_percent); } /* @@ -6747,7 +6900,8 @@ static int __init cmdline_parse_kernelcore(char *p) */ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore); + return cmdline_parse_core(p, &required_movablecore, + &required_movablecore_percent); } early_param("kernelcore", cmdline_parse_kernelcore); @@ -7591,7 +7745,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CMA); + NULL, 0, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -7611,11 +7765,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned, however it's the caller's responsibility to guarantee that - * we are the only thread that changes migrate type of pageblocks the - * pages fall in. + * aligned. The PFN range must belong to a single zone. * - * The PFN range must belong to a single zone. + * The first thing this routine does is attempt to MIGRATE_ISOLATE all + * pageblocks in the range. Once isolated, the pageblocks should not + * be modified by others. * * Returns zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and |