diff options
Diffstat (limited to 'mm')
55 files changed, 3778 insertions, 1713 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 989f8f3d77e0..2664c118b5d2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE def_bool y depends on SPARSEMEM && MEMORY_HOTPLUG +config MEMORY_HOTPLUG_DEFAULT_ONLINE + bool "Online the newly added memory blocks by default" + default n + depends on MEMORY_HOTPLUG + help + This option sets the default policy setting for memory hotplug + onlining policy (/sys/devices/system/memory/auto_online_blocks) which + determines what happens to newly added memory regions. Policy setting + can always be changed at runtime. + See Documentation/memory-hotplug.txt for more information. + + Say Y here if you want all hot-plugged memory blocks to appear in + 'online' state by default. + Say N here if you want the default policy to keep all hot-plugged + memory blocks in 'offline' state. + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select MEMORY_ISOLATION @@ -268,11 +284,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -config ZONE_DMA_FLAG - int - default "0" if !ZONE_DMA - default "1" - config BOUNCE bool "Enable bounce buffers" default y @@ -393,6 +404,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION + select RADIX_TREE_MULTIORDER help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -556,7 +568,7 @@ config ZPOOL zsmalloc. config ZBUD - tristate "Low density storage for compressed pages" + tristate "Low (Up to 2x) density storage for compressed pages" default n help A special purpose allocator for storing compressed pages. @@ -565,6 +577,16 @@ config ZBUD deterministic reclaim properties that make it preferable to a higher density approach when reclaim will be used. 
+config Z3FOLD + tristate "Up to 3x density storage for compressed pages" + depends on ZPOOL + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to three compressed pages per physical + page. It is a ZBUD derivative so the simplicity and determinism are + still there. + config ZSMALLOC tristate "Memory allocator for compressed pages" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index deb467edca2d..78c6f7dedb83 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_Z3FOLD) += z3fold.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0c6317b7db38..ed173b8ae8f2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, a short sleep or a cond_resched is - * performed to yield the processor and to allow other subsystems to make - * a forward progress. + * In the absence of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. 
Otherwise, * it is the number of jiffies that were still remaining when the function @@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) */ if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that a particular - * WQ is congested if the worker thread is looping without - * ever sleeping. Therefore we have to do a short sleep - * here rather than calling cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) diff --git a/mm/compaction.c b/mm/compaction.c index ccf97b02b85f..1427366ad673 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) +#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) +#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone) zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = - round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); + pageblock_start_pfn(zone_end_pfn(zone) - 1); } /* @@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc, LIST_HEAD(freelist); pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + 
block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn += isolated, block_start_pfn = block_end_pfn, @@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); } @@ -633,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; - struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; /* * Ensure that there are not too many pages isolated from the LRU @@ -659,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (compact_should_abort(cc)) return 0; + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { bool is_lru; + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. 
Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -674,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, break; if (!pfn_valid_within(low_pfn)) - continue; + goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -729,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (likely(comp_order < MAX_ORDER)) low_pfn += (1UL << comp_order) - 1; - continue; + goto isolate_fail; } if (!is_lru) - continue; + goto isolate_fail; /* * Migration will fail if an anonymous page is pinned in memory, @@ -742,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!page_mapping(page) && page_count(page) > page_mapcount(page)) - continue; + goto isolate_fail; /* If we already hold the lock, we can skip some rechecking */ if (!locked) { @@ -753,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) - continue; + goto isolate_fail; /* * Page become compound since the non-locked check, @@ -762,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page))) { low_pfn += (1UL << compound_order(page)) - 1; - continue; + goto isolate_fail; } } @@ -770,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) - continue; + goto isolate_fail; VM_BUG_ON_PAGE(PageCompound(page), page); 
@@ -778,15 +811,55 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - list_add(&page->lru, migratelist); + list_add(&page->lru, &cc->migratepages); cc->nr_migratepages++; nr_isolated++; + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that was isolated and likely be + * then freed by migration. + */ + if (!cc->last_migrated_pfn) + cc->last_migrated_pfn = low_pfn; + /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; break; } + + continue; +isolate_fail: + if (!skip_on_failure) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } + acct_isolated(zone, cc); + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + cc->last_migrated_pfn = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } } /* @@ -834,10 +907,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, /* Scan block by block. 
First and last block may be incomplete */ pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn = block_end_pfn, block_start_pfn = block_end_pfn, @@ -852,16 +925,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, ISOLATE_UNEVICTABLE); - /* - * In case of fatal failure, release everything that might - * have been isolated in the previous iteration, and signal - * the failure back to caller. - */ - if (!pfn) { - putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; + if (!pfn) break; - } if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; @@ -932,10 +997,10 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = pageblock_start_pfn(cc->free_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); - low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + low_pfn = pageblock_end_pfn(cc->migrate_pfn); /* * Isolate free pages until enough are available to migrate the @@ -1078,7 +1143,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long block_start_pfn; unsigned long block_end_pfn; unsigned long low_pfn; - unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? 
ISOLATE_UNEVICTABLE : 0) | @@ -1089,12 +1153,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; - block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(low_pfn); /* * Iterate over whole pageblocks until we find the first suitable. @@ -1133,7 +1197,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ - isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); @@ -1143,15 +1206,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (cc->nr_migratepages && !cc->last_migrated_pfn) - cc->last_migrated_pfn = isolate_start_pfn; - - /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. 
@@ -1175,7 +1229,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1198,7 +1252,10 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1238,8 +1295,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1258,8 +1316,10 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) +static enum compact_result __compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1282,7 +1342,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1303,12 +1364,14 @@ static unsigned long __compaction_suitable(struct zone *zone, 
int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) +enum compact_result compaction_suitable(struct zone *zone, int order, + unsigned int alloc_flags, + int classzone_idx) { - unsigned long ret; + enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1316,9 +1379,42 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) { - int ret; + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. 
+ */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) +{ + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1326,15 +1422,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); - switch (ret) { - case COMPACT_PARTIAL: - case COMPACT_SKIPPED: - /* Compaction is likely to fail */ + /* Compaction is likely to fail */ + if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) return ret; - case COMPACT_CONTINUE: - /* Fall through to compaction */ - ; - } + + /* huh, compaction_suitable is returning something unexpected */ + VM_BUG_ON(ret != COMPACT_CONTINUE); /* * Clear pageblock skip if there were failures recently and compaction @@ -1351,7 +1444,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { @@ -1359,6 +1452,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone 
= true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1406,6 +1503,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = COMPACT_CONTENDED; goto out; } + /* + * We failed to migrate at least one page in the current + * order-aligned block, so skip the rest of it. + */ + if (cc->direct_compaction && + (cc->mode == MIGRATE_ASYNC)) { + cc->migrate_pfn = block_end_pfn( + cc->migrate_pfn - 1, cc->order); + /* Draining pcplists is useless in this case */ + cc->last_migrated_pfn = 0; + + } } check_drain: @@ -1419,7 +1528,7 @@ check_drain: if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = - cc->migrate_pfn & ~((1UL << cc->order) - 1); + block_start_pfn(cc->migrate_pfn, cc->order); if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); @@ -1444,7 +1553,7 @@ out: cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ - free_pfn &= ~(pageblock_nr_pages-1); + free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() @@ -1462,11 +1571,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1504,15 +1613,15 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1526,15 +1635,17 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, - ac->classzone_idx); + ac_classzone_idx(ac)); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1544,7 +1655,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. 
The caller @@ -1566,7 +1677,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1601,7 +1713,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; @@ -1741,7 +1853,7 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0; + return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -1750,7 +1862,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) struct zone *zone; enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; - for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) @@ -1785,7 +1897,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -1805,13 +1917,15 @@ static void kcompactd_do_work(pg_data_t *pgdat) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); + if (kthread_should_stop()) + return; status = compact_zone(zone, &cc); if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), cc.classzone_idx, 0)) { success = true; compaction_defer_reset(zone, cc.order, false); - } else if (status == 
COMPACT_COMPLETE) { + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* * We use sync migration mode here, so we defer like * sync direct compaction does. diff --git a/mm/filemap.c b/mm/filemap.c index 7b9a4b180cae..00ae878b2a38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - unsigned long index; - unsigned int offset; - unsigned int tag; - void **slot; VM_BUG_ON(!PageLocked(page)); - __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, + shadow); if (shadow) { mapping->nrexceptional++; @@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping, } mapping->nrpages--; - if (!node) { - /* Clear direct pointer tags in root node */ - mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; - radix_tree_replace_slot(slot, shadow); + if (!node) return; - } - - /* Clear tree tags for the removed page */ - index = page->index; - offset = index & RADIX_TREE_MAP_MASK; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (test_bit(offset, node->tags[tag])) - radix_tree_tag_clear(&mapping->page_tree, index, tag); - } - /* Delete page, swap shadow entry */ - radix_tree_replace_slot(slot, shadow); workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); @@ -215,7 +198,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) * some other bad page check should catch it later. */ page_mapcount_reset(page); - atomic_sub(mapcount, &page->_count); + page_ref_sub(page, mapcount); } } @@ -725,8 +708,12 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * The page might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed page. 
+ * The exception is pages getting rewritten; evicting other + * data from the working set, only to cache data that will + * get overwritten with something else, is a waste of memory. */ - if (shadow && workingset_refault(shadow)) { + if (!(gfp_mask & __GFP_WRITE) && + shadow && workingset_refault(shadow)) { SetPageActive(page); workingset_activation(page); } else @@ -1850,8 +1837,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - loff_t pos = *ppos; size_t count = iov_iter_count(iter); if (!count) @@ -1863,15 +1848,15 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, pos, - pos + count - 1); + retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1); if (!retval) { struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(iocb, &data, pos); + retval = mapping->a_ops->direct_IO(iocb, &data); } if (retval > 0) { - *ppos = pos + retval; + iocb->ki_pos += retval; iov_iter_advance(iter, retval); } @@ -1884,14 +1869,14 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. 
*/ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || IS_DAX(inode)) { file_accessed(file); goto out; } } - retval = do_generic_file_read(file, ppos, iter, retval); + retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); out: return retval; } @@ -2201,7 +2186,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(vma, addr, page, pte, false, false, true); unlock_page(page); goto next; unlock: @@ -2512,11 +2497,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; ssize_t written; size_t write_len; pgoff_t end; @@ -2550,7 +2536,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) } data = *from; - written = mapping->a_ops->direct_IO(iocb, &data, pos); + written = mapping->a_ops->direct_IO(iocb, &data); /* * Finally, try again to invalidate clean pages which might have been @@ -2587,7 +2573,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { struct page *page; - int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; + int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; if (flags & AOP_FLAG_NOFS) fgp_flags |= FGP_NOFS; @@ -2730,7 +2716,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_DIRECT) { loff_t pos, endbyte; - written = generic_file_direct_write(iocb, from, iocb->ki_pos); + written = generic_file_direct_write(iocb, from); /* * If 
the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -2804,13 +2790,8 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); diff --git a/mm/highmem.c b/mm/highmem.c index 123bcd3ed4f2..50b4ca6787f0 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); unsigned int nr_free_highpages (void) { - pg_data_t *pgdat; + struct zone *zone; unsigned int pages = 0; - for_each_online_pgdat(pgdat) { - pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); - if (zone_movable_is_highmem()) - pages += zone_page_state( - &pgdat->node_zones[ZONE_MOVABLE], - NR_FREE_PAGES); + for_each_populated_zone(zone) { + if (is_highmem(zone)) + pages += zone_page_state(zone, NR_FREE_PAGES); } return pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86f9f8b82f8e..9ed58530f695 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,6 +89,7 @@ static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static unsigned long khugepaged_sleep_expire; static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); static DEFINE_SPINLOCK(khugepaged_mm_lock); @@ -232,7 +233,7 @@ retry: return READ_ONCE(huge_zero_page); } -static void put_huge_zero_page(void) +void put_huge_zero_page(void) { /* * Counter should never go to zero here. 
Only shrinker can put @@ -467,6 +468,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, return -EINVAL; khugepaged_scan_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; @@ -494,6 +496,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; @@ -764,10 +767,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) { - pmd_t entry; - entry = mk_pmd(page, prot); - entry = pmd_mkhuge(entry); - return entry; + return pmd_mkhuge(mk_pmd(page, prot)); } static inline struct list_head *page_deferred_list(struct page *page) @@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) @@ -1298,15 +1299,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. - * This can give false negative if somebody pinned the page, but that's - * fine. + * part. 
*/ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1684,12 +1679,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); @@ -1704,20 +1699,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* @@ -1960,10 +1952,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * page fault if needed. 
*/ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2080,7 +2071,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -2352,8 +2344,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); - return true; + return !(vma->vm_flags & VM_NO_THP); } static void collapse_huge_page(struct mm_struct *mm, @@ -2804,15 +2795,25 @@ static void khugepaged_do_scan(void) put_page(hpage); } +static bool khugepaged_should_wakeup(void) +{ + return kthread_should_stop() || + time_after_eq(jiffies, khugepaged_sleep_expire); +} + static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) + const unsigned long scan_sleep_jiffies = + msecs_to_jiffies(khugepaged_scan_sleep_millisecs); + + if (!scan_sleep_jiffies) return; + khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, - kthread_should_stop(), - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + khugepaged_should_wakeup(), + scan_sleep_jiffies); return; } @@ -3036,8 +3037,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, return; /* - * Caller holds the mmap_sem write mode, so a huge pmd cannot - * materialize from under us. 
+ * Caller holds the mmap_sem write mode or the anon_vma lock, + * so a huge pmd cannot materialize from under us (khugepaged + * holds both the mmap_sem write mode and the anon_vma lock + * write mode). */ __split_huge_pmd(vma, pmd, address, freeze); } @@ -3120,7 +3123,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3225,6 +3228,64 @@ int total_mapcount(struct page *page) } /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. 
+ * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + +/* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. 
* @@ -3289,7 +3350,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); @@ -3454,7 +3515,7 @@ next: } } - pr_info("%lu of %lu THP split", split, total); + pr_info("%lu of %lu THP split\n", split, total); return 0; } @@ -3465,7 +3526,7 @@ static int __init split_huge_pages_debugfs(void) { void *ret; - ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); if (!ret) pr_warn("Failed to create split_huge_pages in debugfs"); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..d26162e81fea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } } - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on @@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= 
spool->min_hpages) ret = 0; else @@ -624,6 +627,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma, { return vma_hugecache_offset(hstate_vma(vma), vma, address); } +EXPORT_SYMBOL_GPL(linear_hugepage_index); /* * Return the size of the pages allocated when backing a VMA. In the majority @@ -937,9 +941,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; @@ -1030,8 +1032,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); } -static bool pfn_range_valid_gigantic(unsigned long start_pfn, - unsigned long nr_pages) +static bool pfn_range_valid_gigantic(struct zone *z, + unsigned long start_pfn, unsigned long nr_pages) { unsigned long i, end_pfn = start_pfn + nr_pages; struct page *page; @@ -1042,6 +1044,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn, page = pfn_to_page(i); + if (page_zone(page) != z) + return false; + if (PageReserved(page)) return false; @@ -1074,7 +1079,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) pfn = ALIGN(z->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(pfn, nr_pages)) { + if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -2659,6 +2664,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... 
option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2678,8 +2688,8 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_MEMORY]); + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -2691,11 +2701,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". 
*/ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d8fb10de0f14..eec1150125b9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -67,26 +67,42 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) return false; } +static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, + struct hugetlb_cgroup *parent_h_cgroup) +{ + int idx; + + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { + struct page_counter *counter = &h_cgroup->hugepage[idx]; + struct page_counter *parent = NULL; + unsigned long limit; + int ret; + + if (parent_h_cgroup) + parent = &parent_h_cgroup->hugepage[idx]; + page_counter_init(counter, parent); + + limit = round_down(PAGE_COUNTER_MAX, + 1 << huge_page_order(&hstates[idx])); + ret = page_counter_limit(counter, limit); + VM_BUG_ON(ret); + } +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; - int idx; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); - if (parent_h_cgroup) { - for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - page_counter_init(&h_cgroup->hugepage[idx], - &parent_h_cgroup->hugepage[idx]); - } else { + if (!parent_h_cgroup) root_h_cgroup = h_cgroup; - for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - page_counter_init(&h_cgroup->hugepage[idx], NULL); - } + + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; } @@ -285,6 +301,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret; idx = MEMFILE_IDX(of_cft(of)->private); + nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: diff --git a/mm/internal.h b/mm/internal.h index 
b79abb6721cf..a37e5b6f9d25 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) @@ -102,13 +102,14 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; + struct zoneref *preferred_zoneref; int migratetype; enum zone_type high_zoneidx; bool spread_dirty_pages; }; +#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -173,9 +174,10 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... 
*/ + bool whole_zone; /* Whole zone has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock @@ -440,7 +442,7 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 131daadf40e4..1548749a3d45 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-y := kasan.o report.o kasan_init.o +obj-$(CONFIG_SLAB) += quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 38f1dd79acdb..18b6a2b8d183 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -273,32 +273,48 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } - -static __always_inline void check_memory_region(unsigned long addr, - size_t size, bool write) +static __always_inline void check_memory_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) { if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); return; } if (likely(!memory_is_poisoned(addr, size))) return; - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); +} + +static void 
check_memory_region(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + check_memory_region_inline(addr, size, write, ret_ip); +} + +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); } +EXPORT_SYMBOL(kasan_check_read); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - __asan_storeN((unsigned long)addr, len); + check_memory_region((unsigned long)addr, len, true, _RET_IP_); return __memset(addr, c, len); } @@ -306,8 +322,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memmove(dest, src, len); } @@ -315,8 +331,8 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memcpy(dest, src, len); } @@ -388,6 +404,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } #endif +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void kasan_cache_destroy(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -482,7 +508,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, 
void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -491,18 +517,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ #ifdef CONFIG_SLAB - if (cache->flags & SLAB_KASAN) { - struct kasan_free_meta *free_info = - get_free_info(cache, object); + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + if (likely(cache->flags & SLAB_KASAN)) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track, GFP_NOWAIT); + struct kasan_free_meta *free_info = + get_free_info(cache, object); + + switch (alloc_info->state) { + case KASAN_STATE_ALLOC: + alloc_info->state = KASAN_STATE_QUARANTINE; + quarantine_put(free_info, cache); + set_track(&free_info->track, GFP_NOWAIT); + kasan_poison_slab_free(cache, object); + return true; + case KASAN_STATE_QUARANTINE: + case KASAN_STATE_FREE: + pr_err("Double free"); + dump_stack(); + break; + default: + break; + } } + return false; +#else + kasan_poison_slab_free(cache, object); + return false; #endif - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -511,6 +562,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(object == NULL)) 
return; @@ -541,6 +595,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; @@ -649,22 +706,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size) } EXPORT_SYMBOL(__asan_unregister_globals); -#define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, false); \ - } \ - EXPORT_SYMBOL(__asan_load##size); \ - __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, true); \ - } \ - EXPORT_SYMBOL(__asan_store##size); \ - __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, false, _RET_IP_);\ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -675,7 +732,7 @@ DEFINE_ASAN_LOAD_STORE(16); void __asan_loadN(unsigned long addr, size_t size) { - check_memory_region(addr, size, false); + check_memory_region(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); @@ -685,7 +742,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort); void __asan_storeN(unsigned long addr, size_t size) { - check_memory_region(addr, size, true); + check_memory_region(addr, size, true, _RET_IP_); } 
EXPORT_SYMBOL(__asan_storeN); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30a2f0ba0e09..7f7ac51d7faf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -62,6 +62,7 @@ struct kasan_global { enum kasan_state { KASAN_STATE_INIT, KASAN_STATE_ALLOC, + KASAN_STATE_QUARANTINE, KASAN_STATE_FREE }; @@ -79,9 +80,14 @@ struct kasan_alloc_meta { u32 reserved; }; +struct qlist_node { + struct qlist_node *next; +}; struct kasan_free_meta { - /* Allocator freelist pointer, unused by KASAN. */ - void **freelist; + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. + */ + struct qlist_node quarantine_link; struct kasan_track track; }; @@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +#ifdef CONFIG_SLAB +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..4973505a9bdd --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,291 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko <glider@google.com> + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include <linux/gfp.h> +#include <linux/hash.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/printk.h> +#include <linux/shrinker.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a singly-linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +static void qlist_move(struct qlist_head *from, struct qlist_node *last, + struct qlist_head *to, size_t size) +{ + if (unlikely(last == from->tail)) { + qlist_move_all(from, to); + return; + } + if (qlist_empty(to)) + to->head = from->head; + else + to->tail->next = from->head; + to->tail = last; + from->head = last->next; + last->next = NULL; + from->bytes -= size; + to->bytes += size; +} +
+ +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +static struct qlist_head global_quarantine; +static DEFINE_SPINLOCK(quarantine_lock); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. + */ +#define QUARANTINE_FRACTION 32 + +#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) +#define QUARANTINE_PERCPU_SIZE (1 << 20) + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long flags; + + local_irq_save(flags); + alloc_info->state = KASAN_STATE_FREE; + ___cache_free(cache, object, _THIS_IP_); + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? 
cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + qlist_move_all(q, &temp); + + local_irq_restore(flags); + + if (unlikely(!qlist_empty(&temp))) { + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_all(&temp, &global_quarantine); + spin_unlock_irqrestore(&quarantine_lock, flags); + } +} + +void quarantine_reduce(void) +{ + size_t new_quarantine_size; + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + size_t size_to_free = 0; + struct qlist_node *last; + + if (likely(READ_ONCE(global_quarantine.bytes) <= + READ_ONCE(quarantine_size))) + return; + + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. 
+ */ + new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus(); + WRITE_ONCE(quarantine_size, new_quarantine_size); + + last = global_quarantine.head; + while (last) { + struct kmem_cache *cache = qlink_to_cache(last); + + size_to_free += cache->size; + if (!last->next || size_to_free > + global_quarantine.bytes - QUARANTINE_LOW_SIZE) + break; + last = last->next; + } + qlist_move(&global_quarantine, last, &to_free, size_to_free); + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *prev = NULL, *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + while (curr) { + struct qlist_node *qlink = curr; + struct kmem_cache *obj_cache = qlink_to_cache(qlink); + + if (obj_cache == cache) { + if (unlikely(from->head == qlink)) { + from->head = curr->next; + prev = curr; + } else + prev->next = curr->next; + if (unlikely(from->tail == qlink)) + from->tail = curr->next; + from->bytes -= cache->size; + qlist_put(to, qlink, cache->size); + } else { + prev = curr; + } + curr = curr->next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_cache(&global_quarantine, &to_free, cache); + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 
60869a5a0124..b3c122ddd454 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page, print_track(&alloc_info->track); break; case KASAN_STATE_FREE: + case KASAN_STATE_QUARANTINE: pr_err("Object freed, allocated with size %u bytes\n", alloc_info->alloc_size); free_info = get_free_info(cache, object); @@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1663,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. 
+ */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; diff --git a/mm/madvise.c b/mm/madvise.c index 07427d3fcead..93fb63e88b5e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -707,10 +707,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return error; write = madvise_need_mmap_write(behavior); - if (write) - down_write(&current->mm->mmap_sem); - else + if (write) { + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; + } else { down_read(&current->mm->mmap_sem); + } /* * If the interval [start,end) covers some unmapped address diff --git a/mm/memblock.c b/mm/memblock.c index b570dddb4cb9..ac1248933b31 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -606,22 +606,14 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, return memblock_add_range(&memblock.memory, base, size, nid, 0); } -static int __init_memblock memblock_add_region(phys_addr_t base, - phys_addr_t size, - int nid, - unsigned long flags) +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - flags, (void *)_RET_IP_); - - return memblock_add_range(&memblock.memory, base, size, nid, flags); -} + 0UL, (void *)_RET_IP_); -int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) -{ - return memblock_add_region(base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.memory, base, size, 
MAX_NUMNODES, 0); } /** @@ -732,22 +724,14 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -static int __init_memblock memblock_reserve_region(phys_addr_t base, - phys_addr_t size, - int nid, - unsigned long flags) +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - flags, (void *)_RET_IP_); - - return memblock_add_range(&memblock.reserved, base, size, nid, flags); -} + 0UL, (void *)_RET_IP_); -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) -{ - return memblock_reserve_region(base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } /** @@ -840,7 +824,7 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, { struct memblock_type *type = &memblock.reserved; - if (*idx >= 0 && *idx < type->cnt) { + if (*idx < type->cnt) { struct memblock_region *r = &type->regions[*idx]; phys_addr_t base = r->base; phys_addr_t size = r->size; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36db05fa8acb..cf428d7b9a03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -1022,22 +1023,40 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. 
+ * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; + + __update_lru_size(lruvec, lru, nr_pages); if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) @@ -1256,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } @@ -1388,14 +1408,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* - * We call this when we hit limit, not when pages are added to LRU. - * No LRU may hold pages because all pages are UNEVICTABLE or - * memcg is too small and all pages are not on LRU. In that case, - * we use curret node. + * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages + * last time it really checked all the LRUs due to rate limiting. 
+ * Fallback to the current node in that case for simplicity. */ if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); @@ -1587,7 +1604,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || current->memcg_in_oom) return; /* * We are in the middle of the charge context here, so we @@ -2635,8 +2652,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) } /* - * Reclaims as many pages from the given memcg as possible and moves - * the rest to the parent. + * Reclaims as many pages from the given memcg as possible. * * Caller is responsible for holding css reference for memcg. */ @@ -4667,6 +4683,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -4676,7 +4694,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4733,6 +4754,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4742,8 +4764,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4852,11 +4875,11 @@ put: /* get_mctgt_type() gets the page */ return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -4868,7 +4891,7 @@ static void mem_cgroup_move_charge(struct 
mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -4885,23 +4908,16 @@ retry: * additional charge, the page walk just aborts. */ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4911,7 +4927,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5195,7 +5211,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 78f5f2641b91..2fcca6b0e005 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,8 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", - pfn, 
t->comm, t->pid); + pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -208,7 +208,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - pr_info("MCE: Error sending signal to %s:%d: %d\n", + pr_info("Memory failure: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); return ret; } @@ -289,7 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - pr_err("MCE: Out of memory while machine check handling\n"); + pr_err("Memory failure: Out of memory while machine check handling\n"); return; } } @@ -303,7 +303,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_info("MCE: Unable to find user space address %lx in %s\n", + pr_info("Memory failure: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -334,7 +334,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. 
*/ if (fail || tk->addr_valid == 0) { - pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -347,7 +347,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); @@ -559,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - pr_err("MCE %#lx: Unknown page state\n", pfn); + pr_err("Memory failure: %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -604,11 +604,12 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - pr_info("MCE %#lx: Failed to punch page: %d\n", + pr_info("Memory failure: %#lx: Failed to punch page: %d\n", pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_info("MCE %#lx: failed to release buffers\n", pfn); + pr_info("Memory failure: %#lx: failed to release buffers\n", + pfn); } else { ret = MF_RECOVERED; } @@ -620,7 +621,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - pr_info("MCE %#lx: Failed to invalidate\n", pfn); + pr_info("Memory failure: %#lx: Failed to invalidate\n", + pfn); } return ret; } @@ -833,7 +835,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - pr_err("MCE %#lx: recovery action for %s: %s\n", + pr_err("Memory 
failure: %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -849,7 +851,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - pr_err("MCE %#lx: %s still referenced by %d users\n", + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -882,13 +884,22 @@ int get_hwpoison_page(struct page *page) * tries to touch the "partially handled" page. */ if (!PageAnon(head)) { - pr_err("MCE: %#lx: non anonymous thp\n", + pr_err("Memory failure: %#lx: non anonymous thp\n", page_to_pfn(page)); return 0; } } - return get_page_unless_zero(head); + if (get_page_unless_zero(head)) { + if (head == compound_head(page)) + return 1; + + pr_info("Memory failure: %#lx cannot catch tail\n", + page_to_pfn(page)); + put_page(head); + } + + return 0; } EXPORT_SYMBOL_GPL(get_hwpoison_page); @@ -923,12 +934,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, return SWAP_SUCCESS; if (PageKsm(p)) { - pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); + pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); return SWAP_FAIL; } if (PageSwapCache(p)) { - pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n", + pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -946,7 +958,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -964,7 +976,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + 
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); /* @@ -1032,14 +1044,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - pr_err("MCE %#lx: memory outside kernel control\n", pfn); + pr_err("Memory failure: %#lx: memory outside kernel control\n", + pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - pr_err("MCE %#lx: already hardware poisoned\n", pfn); + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); return 0; } @@ -1104,9 +1118,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { unlock_page(hpage); if (!PageAnon(hpage)) - pr_err("MCE: %#lx: non anonymous thp\n", pfn); + pr_err("Memory failure: %#lx: non anonymous thp\n", + pfn); else - pr_err("MCE: %#lx: thp split failed\n", pfn); + pr_err("Memory failure: %#lx: thp split failed\n", + pfn); if (TestClearPageHWPoison(p)) num_poisoned_pages_sub(nr_pages); put_hwpoison_page(p); @@ -1170,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - pr_err("MCE %#lx: just unpoisoned\n", pfn); + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); @@ -1387,25 +1403,25 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); return 0; } if (page_count(page) > 1) { - unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); return 0; } if (page_mapped(page)) { - 
unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); return 0; } if (page_mapping(page)) { - unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1416,7 +1432,7 @@ int unpoison_memory(unsigned long pfn) * In such case, we yield to memory_failure() and make unpoison fail. */ if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1431,13 +1447,13 @@ int unpoison_memory(unsigned long pfn) * to the end. */ if (PageHuge(page)) { - unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1450,7 +1466,7 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); num_poisoned_pages_sub(nr_pages); freeit = 1; diff --git a/mm/memory.c b/mm/memory.c index 06f552504e79..15322b73636b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -790,6 +790,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. 
+ * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -1183,15 +1223,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { -#ifdef CONFIG_DEBUG_VM - if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { - pr_err("%s: mmap_sem is unlocked! 
addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", - __func__, addr, end, - vma->vm_start, - vma->vm_end); - BUG(); - } -#endif + VM_BUG_ON_VMA(vma_is_anonymous(vma) && + !rwsem_is_locked(&tlb->mm->mmap_sem), vma); split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; @@ -1712,6 +1745,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; + unsigned long remap_pfn = pfn; int err; /* @@ -1738,7 +1772,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; @@ -1757,7 +1791,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); return err; } @@ -2341,6 +2375,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); pte_unmap_unlock(page_table, ptl); @@ -2355,13 +2390,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. 
+ */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2583,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -2837,7 +2877,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) + struct page *page, pte_t *pte, bool write, bool anon, bool old) { pte_t entry; @@ -2845,6 +2885,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old) + entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -2858,8 +2900,16 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } +/* + * If architecture emulates "accessed" or "young" bit without HW support, + * there is no much gain with fault_around. 
+ */ static unsigned long fault_around_bytes __read_mostly = +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + PAGE_SIZE; +#else rounddown_pow_of_two(65536); +#endif #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) @@ -2982,9 +3032,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) goto unlock_out; + do_fault_around(vma, address, pte, pgoff, flags); + /* Check if the fault is handled by faultaround */ + if (!pte_same(*pte, orig_pte)) { + /* + * Faultaround produce old pte, but the pte we've + * handler fault for should be young. + */ + pte_t entry = pte_mkyoung(*pte); + if (ptep_set_access_flags(vma, address, pte, entry, 0)) + update_mmu_cache(vma, address, pte); + goto unlock_out; + } pte_unmap_unlock(pte, ptl); } @@ -2999,7 +3060,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false); + do_set_pte(vma, address, fault_page, pte, false, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3050,7 +3111,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true); + do_set_pte(vma, address, new_page, pte, true, true, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3103,7 +3164,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); + do_set_pte(vma, address, fault_page, pte, true, false, false); pte_unmap_unlock(pte, ptl); if 
(set_page_dirty(fault_page)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431c3f31..caf2a14c37ad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -78,9 +78,24 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; +#else +bool memhp_auto_online = true; +#endif EXPORT_SYMBOL_GPL(memhp_auto_online); +static int __init setup_memhp_default_state(char *str) +{ + if (!strcmp(str, "online")) + memhp_auto_online = true; + else if (!strcmp(str, "offline")) + memhp_auto_online = false; + + return 1; +} +__setup("memhp_default_state=", setup_memhp_default_state); + void get_online_mems(void) { might_sleep(); @@ -1410,7 +1425,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. */ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1433,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..297d6854f849 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> -#include <linux/random.h> #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if 
(!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1744,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? z->zone->node : node; } default: @@ -1763,23 +1758,25 @@ unsigned int mempolicy_slab_node(void) } } -/* Do static interleaving for a VMA with known offset. */ +/* + * Do static interleaving for a VMA with known offset @n. Returns the n'th + * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the + * number of present nodes. 
+ */ static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) + struct vm_area_struct *vma, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; - int c; - int nid = NUMA_NO_NODE; + int i; + int nid; if (!nnodes) return numa_node_id(); - target = (unsigned int)off % nnodes; - c = 0; - do { + target = (unsigned int)n % nnodes; + nid = first_node(pol->v.nodes); + for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); - c++; - } while (c <= target); return nid; } @@ -1805,21 +1802,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) @@ -2284,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2316,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. 
* use current page if in policy nodemask, @@ -2324,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/mempool.c b/mm/mempool.c index 9b7a14a791cc..9e075f829d0d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); + kasan_poison_slab_free(pool->pool_data, element); if (pool->alloc == mempool_kmalloc) kasan_kfree(element); if (pool->alloc == mempool_alloc_pages) diff --git a/mm/migrate.c b/mm/migrate.c index 6c822a7b27e0..9baf41c877ff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); return MIGRATEPAGE_SUCCESS; } @@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { @@ -975,7 +975,13 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) { + if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + /* + * With this release, we free successfully migrated + * page and set PG_HWPoison on just freed page + * 
intentionally. Although it's rather weird, it's how + * HWPoison flag works at the moment. + */ put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); @@ -1165,6 +1171,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + nr_failed++; goto out; case -EAGAIN: retry++; @@ -1785,7 +1792,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/mlock.c b/mm/mlock.c index 96f001041928..ef8dc9f395c4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -617,7 +617,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return error; } -static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) +static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -635,7 +635,8 @@ static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; locked += current->mm->locked_vm; @@ -678,7 +679,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; - down_write(&current->mm->mmap_sem); + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; ret = apply_vma_lock_flags(start, len, 0); up_write(&current->mm->mmap_sem); @@ -748,9 +750,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - ret = -ENOMEM; - down_write(&current->mm->mmap_sem); + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; + ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 
capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); @@ -765,7 +768,8 @@ SYSCALL_DEFINE0(munlockall) { int ret; - down_write(&current->mm->mmap_sem); + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; ret = apply_mlockall_flags(0); up_write(&current->mm->mmap_sem); return ret; diff --git a/mm/mmap.c b/mm/mmap.c index bd2e1a533bc1..d3d9a94ca031 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -55,10 +55,6 @@ #define arch_mmap_check(addr, len, flags) (0) #endif -#ifndef arch_rebalance_pgtables -#define arch_rebalance_pgtables(addr, len) (addr) -#endif - #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; @@ -70,7 +66,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif -static bool ignore_rlimit_data = true; +static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, @@ -182,7 +178,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long min_brk; bool populate; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; #ifdef CONFIG_COMPAT_BRK /* @@ -1911,7 +1908,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (offset_in_page(addr)) return -EINVAL; - addr = arch_rebalance_pgtables(addr, len); error = security_mmap_addr(addr); return error ? 
error : addr; } @@ -2498,7 +2494,9 @@ int vm_munmap(unsigned long start, size_t len) int ret; struct mm_struct *mm = current->mm; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_munmap(mm, start, len); up_write(&mm->mmap_sem); return ret; @@ -2507,8 +2505,15 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + int ret; + struct mm_struct *mm = current->mm; + profile_munmap(addr); - return vm_munmap(addr, len); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; } @@ -2540,7 +2545,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + vma = find_vma(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) @@ -2705,7 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) unsigned long ret; bool populate; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); @@ -2891,13 +2900,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { - if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n", + /* Workaround for Valgrind */ + if (rlimit(RLIMIT_DATA) == 0 && + mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) + return true; + if (!ignore_rlimit_data) { + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. 
Update limits or use boot option ignore_rlimit_data.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); - else return false; + } } return true; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index f802c2d216a7..6f4d27c5bb32 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -4,9 +4,9 @@ */ #include <linux/mm.h> +#include <linux/sched.h> #include <linux/mmu_context.h> #include <linux/export.h> -#include <linux/sched.h> #include <asm/mmu_context.h> diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb4de6f..5652be858e5e 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/mprotect.c b/mm/mprotect.c index b650c5412f58..5019a1ef2848 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -379,7 +379,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, reqprot = prot; - down_write(&current->mm->mmap_sem); + if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; vma = find_vma(current->mm, start); error = -ENOMEM; diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467df66..1f157adfdaf9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } +static void take_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); +} + +static void drop_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); +} + static pte_t move_soft_dirty_pte(pte_t pte) { /* @@ -90,8 +106,6 @@ static 
void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) { - struct address_space *mapping = NULL; - struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ - if (need_rmap_locks) { - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - } - if (vma->anon_vma) { - anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); - } - } + if (need_rmap_locks) + take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst @@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); - if (anon_vma) - anon_vma_unlock_write(anon_vma); - if (mapping) - i_mmap_unlock_write(mapping); + if (need_rmap_locks) + drop_rmap_locks(vma); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -193,16 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; - VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, - vma); /* See comment in move_ptes() */ if (need_rmap_locks) - anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + take_rmap_locks(vma); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) - anon_vma_unlock_write(vma->anon_vma); + drop_rmap_locks(vma); if (moved) { need_flush = true; continue; @@ -502,7 +503,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (!new_len) return ret; - down_write(&current->mm->mmap_sem); + 
if (down_write_killable(&current->mm->mmap_sem)) + return -EINTR; if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 86349586eacb..5bb2f7698ad7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -278,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -302,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -326,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - 
continue; chosen = p; chosen_points = points; @@ -412,6 +410,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -422,7 +439,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -491,16 +507,17 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore. OOM killer can continue - * by selecting other victim if unmapping hasn't led to any - * improvements. This also means that selecting this task doesn't - * make any sense. + * This task can be safely ignored because we cannot do much more + * to release its memory. */ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; - exit_oom_victim(tsk); + set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. 
+ */ + mmput_async(mm); return ret; } @@ -519,6 +536,15 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); } + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore or it is not a good candidate + * for the oom victim right now because it cannot release its memory + * itself nor by the oom reaper. + */ + tsk->oom_reaper_list = NULL; + exit_oom_victim(tsk); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } @@ -563,6 +589,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. 
+ */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -593,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -610,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -653,24 +728,6 @@ void oom_killer_enable(void) } /* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - -/* * Must be called while holding a reference to p, which will be released upon * returning. 
*/ @@ -694,6 +751,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,10 +931,20 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. + */ + if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + return true; + + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 999792d35ccc..b9956fdee8f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; + int i; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &NODE_DATA(node)->node_zones[i]; - x += zone_dirtyable_memory(z); + if (is_highmem(z)) + x += zone_dirtyable_memory(z); + } } /* * Unreclaimable memory (kernel memory or anonymous memory @@ -407,8 +411,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) bg_thresh = thresh / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - bg_thresh += bg_thresh / 4; - thresh += thresh / 4; + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; @@ -1910,7 +1914,8 @@ bool 
wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { @@ -1924,7 +1929,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59de90d5d3a3..f8f3bfc435ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long 
bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -422,12 +522,6 @@ static void bad_page(struct page *page, const char *reason, static unsigned long 
nr_shown; static unsigned long nr_unshown; - /* Don't complain about poisoned pages */ - if (PageHWPoison(page)) { - page_mapcount_reset(page); /* remove PageBuddy */ - return; - } - /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -513,14 +607,7 @@ static int __init early_debug_pagealloc(char *buf) { if (!buf) return -EINVAL; - - if (strcmp(buf, "on") == 0) - _debug_pagealloc_enabled = true; - - if (strcmp(buf, "off") == 0) - _debug_pagealloc_enabled = false; - - return 0; + return kstrtobool(buf, &_debug_pagealloc_enabled); } early_param("debug_pagealloc", early_debug_pagealloc); @@ -784,17 +871,42 @@ out: zone->free_area[order].nr_free++; } -static inline int free_pages_check(struct page *page) +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary. + */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + (unsigned long)page->mem_cgroup | +#endif + (page->flags & check_flags))) + return false; + + return true; +} + +static void free_pages_check_bad(struct page *page) +{ + const char *bad_reason; + unsigned long bad_flags; + + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -803,15 +915,145 @@ static 
inline int free_pages_check(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; + bad_page(page, bad_reason, bad_flags); +} + +static inline int free_pages_check(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) + return 0; + + /* Something has gone sideways, find it */ + free_pages_check_bad(page); + return 1; +} + +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +static __always_inline bool free_pages_prepare(struct page *page, + unsigned int order, bool check_free) +{ + int bad = 0; + + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, order); + kmemcheck_free_shadow(page, order); + + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. 
+ */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + } } + if (PageAnonHead(page)) + page->mapping = NULL; + if (check_free) + bad += free_pages_check(page); + if (bad) + return false; + page_cpupid_reset_last(page); - if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); + kasan_free_pages(page, order); + + return true; +} + +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, true); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, false); +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_pages_check(page); } +#endif /* CONFIG_DEBUG_VM */ /* * Frees a number of pages from the PCP lists @@ -829,15 +1071,16 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int to_free = count; unsigned long nr_scanned; + bool isolated_pageblocks; spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - while (to_free) { + while 
(count) { struct page *page; struct list_head *list; @@ -857,7 +1100,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* This is the only non-empty list. Free them all. */ if (batch_free == MIGRATE_PCPTYPES) - batch_free = to_free; + batch_free = count; do { int mt; /* migratetype of the to-be-freed page */ @@ -870,12 +1113,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ - if (unlikely(has_isolate_pageblock(zone))) + if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); + if (bulkfree_pcp_prepare(page)) + continue; + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - } while (--to_free && --batch_free && !list_empty(list)); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -899,56 +1145,6 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } -static int free_tail_pages_check(struct page *head_page, struct page *page) -{ - int ret = 1; - - /* - * We rely page->lru.next never has bit 0 set, unless the page - * is PageTail(). Let's make sure that's true even for poisoned ->lru. - */ - BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) { - ret = 0; - goto out; - } - switch (page - head_page) { - case 1: - /* the first tail page: ->mapping is compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); - goto out; - } - break; - case 2: - /* - * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. 
- */ - break; - default: - if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); - goto out; - } - break; - } - if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); - goto out; - } - if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); - goto out; - } - ret = 0; -out: - page->mapping = NULL; - clear_compound_head(page); - return ret; -} - static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { @@ -1003,7 +1199,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -1022,51 +1218,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) } } -static bool free_pages_prepare(struct page *page, unsigned int order) -{ - bool compound = PageCompound(page); - int i, bad = 0; - - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - - trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); - - if (PageAnon(page)) - page->mapping = NULL; - bad += free_pages_check(page); - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); - } - if (bad) - return false; - - reset_page_owner(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE << order); - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } - arch_free_page(page, order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 
<< order, 0); - - return true; -} - static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order)) + if (!free_pages_prepare(page, order, true)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1076,8 +1234,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1154,7 +1311,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, pfn, order); + return __free_pages_boot_core(page, order); } /* @@ -1239,12 +1396,12 @@ static void __init deferred_free_range(struct page *page, if (nr_pages == MAX_ORDER_NR_PAGES && (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pfn, MAX_ORDER-1); + __free_pages_boot_core(page, MAX_ORDER-1); return; } - for (i = 0; i < nr_pages; i++, page++, pfn++) - __free_pages_boot_core(page, pfn, 0); + for (i = 0; i < nr_pages; i++, page++) + __free_pages_boot_core(page, 0); } /* Completion tracking for deferred_init_memmap() threads */ @@ -1477,10 +1634,7 @@ static inline void expand(struct zone *zone, struct page *page, } } -/* - * This page is about to be returned from the page allocator - */ -static inline int check_new_page(struct page *page) +static void check_new_page_bad(struct page *page) { const char *bad_reason = NULL; unsigned long bad_flags = 0; @@ -1494,6 +1648,9 @@ static inline int check_new_page(struct page *page) if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; + /* 
Don't complain about hwpoisoned pages */ + page_mapcount_reset(page); /* remove PageBuddy */ + return; } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; @@ -1503,11 +1660,20 @@ static inline int check_new_page(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - return 0; + bad_page(page, bad_reason, bad_flags); +} + +/* + * This page is about to be returned from the page allocator + */ +static inline int check_new_page(struct page *page) +{ + if (likely(page_expected_state(page, + PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) + return 0; + + check_new_page_bad(page); + return 1; } static inline bool free_pages_prezeroed(bool poisoned) @@ -1516,16 +1682,48 @@ static inline bool free_pages_prezeroed(bool poisoned) page_poisoning_enabled() && poisoned; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) +#ifdef CONFIG_DEBUG_VM +static bool check_pcp_refill(struct page *page) +{ + return false; +} + +static bool check_new_pcp(struct page *page) +{ + return check_new_page(page); +} +#else +static bool check_pcp_refill(struct page *page) +{ + return check_new_page(page); +} +static bool check_new_pcp(struct page *page) +{ + return false; +} +#endif /* CONFIG_DEBUG_VM */ + +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + + if (unlikely(check_new_page(p))) + return true; + } + + return false; +} + +static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { int i; bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; - if (unlikely(check_new_page(p))) - return 1; if (poisoned) poisoned &= page_is_poisoned(p); } @@ -1557,8 +1755,6 @@ static int 
prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_pfmemalloc(page); else clear_page_pfmemalloc(page); - - return 0; } /* @@ -1980,6 +2176,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + if (unlikely(check_pcp_refill(page))) + continue; + /* * Split buddy pages returned by expand() are received here * in physical page order. The page is added to the callers and @@ -2157,6 +2356,10 @@ void mark_free_pages(struct zone *zone) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } @@ -2187,7 +2390,7 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_pages_prepare(page, 0)) + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -2343,12 +2546,44 @@ int split_free_page(struct page *page) } /* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + +/* * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
*/ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2359,21 +2594,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct list_head *list; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } + do { + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + goto failed; + } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + } while (page && check_new_pcp(page)); + __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; } else { @@ -2384,22 +2622,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); 
spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) set_bit(ZONE_FAIR_DEPLETED, &zone->flags); @@ -2500,13 +2740,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2569,12 +2809,38 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } +static inline bool zone_watermark_fast(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, unsigned int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + /* + * Fast check for order-0 only. 
If this fails then the reserves + * need to be calculated. There is a corner case where the check + * passes but only the high-order atomic reserve are free. If + * the caller is !atomic then it'll uselessly search the free + * list. That corner case is then slower but it is harmless. + */ + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + return true; + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx) { @@ -2630,27 +2896,24 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zonelist *zonelist = ac->zonelist; - struct zoneref *z; - struct page *page = NULL; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - int nr_fair_skipped = 0; - bool zonelist_rescan; + bool fair_skipped = false; + bool apply_fair = (alloc_flags & ALLOC_FAIR); zonelist_scan: - zonelist_rescan = false; - /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { + struct page *page; unsigned long mark; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual @@ -2658,13 +2921,16 @@ zonelist_scan: * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. 
*/ - if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(ac->preferred_zone, zone)) - break; + if (apply_fair) { if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - nr_fair_skipped++; + fair_skipped = true; continue; } + if (!zone_local(ac->preferred_zoneref->zone, zone)) { + if (fair_skipped) + goto reset_fair; + apply_fair = false; + } } /* * When allocating a page cache page for writing, we @@ -2696,8 +2962,8 @@ zonelist_scan: continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) { + if (!zone_watermark_fast(zone, order, mark, + ac_classzone_idx(ac), alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2706,7 +2972,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2720,7 +2986,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) + ac_classzone_idx(ac), alloc_flags)) goto try_this_zone; continue; @@ -2728,11 +2994,10 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { - if (prep_new_page(page, order, gfp_mask, alloc_flags)) - goto try_this_zone; + prep_new_page(page, order, gfp_mask, alloc_flags); /* * If this is a high-order atomic allocation then check @@ -2753,18 +3018,13 @@ try_this_zone: * include remote zones now, before entering the slowpath and waking * kswapd: prefer spilling to a remote zone over swapping locally. 
*/ - if (alloc_flags & ALLOC_FAIR) { - alloc_flags &= ~ALLOC_FAIR; - if (nr_fair_skipped) { - zonelist_rescan = true; - reset_alloc_batches(ac->preferred_zone); - } - if (nr_online_nodes > 1) - zonelist_rescan = true; - } - - if (zonelist_rescan) + if (fair_skipped) { +reset_fair: + apply_fair = false; + fair_skipped = false; + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; + } return NULL; } @@ -2872,22 +3132,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for IO-less reclaim */ - if (!(gfp_mask & __GFP_FS)) { - /* - * XXX: Page reclaim didn't yield anything, - * and the OOM killer can't be invoked, but - * keep looping as per tradition. - * - * But do not keep looping if oom_killer_disable() - * was already called, for the system is trying to - * enter a quiescent state during suspend. - */ - *did_some_progress = !oom_killer_disabled; - goto out; - } if (pm_suspended_storage()) goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; @@ -2913,34 +3169,33 @@ out: return page; } + +/* + * Maximum number of compaction retries with a progress before OOM + * killer is considered as the only way to move forward.
+ */ +#define MAX_COMPACT_RETRIES 16 + #ifdef CONFIG_COMPACTION /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, enum compact_result *compact_result) { - unsigned long compact_result; struct page *page; + int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, contended_compaction); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, &contended_compaction); current->flags &= ~PF_MEMALLOC; - switch (compact_result) { - case COMPACT_DEFERRED: - *deferred_compaction = true; - /* fall-through */ - case COMPACT_SKIPPED: + if (*compact_result <= COMPACT_INACTIVE) return NULL; - default: - break; - } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -2966,19 +3221,112 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + *compact_result = COMPACT_CONTENDED; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. 
+ */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + *compact_result = COMPACT_CONTENDED; + cond_resched(); return NULL; } + +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, + int compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + + if (!order) + return false; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by weak migration mode. + */ + if (compaction_failed(compact_result)) { + if (*migrate_mode == MIGRATE_ASYNC) { + *migrate_mode = MIGRATE_SYNC_LIGHT; + return true; + } + return false; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. 
+ */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (compaction_retries <= max_retries) + return true; + + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, enum compact_result *compact_result) { + *compact_result = COMPACT_SKIPPED; return NULL; } + +static inline bool +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, + enum migrate_mode *migrate_mode, + int compaction_retries) +{ + struct zone *zone; + struct zoneref *z; + + if (!order || order > PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * There are setups with compaction disabled which would prefer to loop + * inside the allocator rather than hit the oom killer prematurely. + * Let's give them a good hope and keep retrying while the order-0 + * watermarks are OK. 
+ */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), + ac_classzone_idx(ac), alloc_flags)) + return true; + } + return false; +} #endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ @@ -3013,7 +3361,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3049,13 +3397,13 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, ac_classzone_idx(ac)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3110,18 +3458,113 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; } +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is considered as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round) and no_progress_loops (number of reclaim rounds without + * any progress in a row) is considered as well as the reclaimable pages on the + * applicable zone list (with a backoff mechanism which is a function of + * no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, int no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in a row. + */ + if (no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + /* + * Keep reclaiming pages while there is a chance this will lead somewhere. + * If none of the target zones can satisfy our allocation request even + * if all reclaimable pages are considered then we are screwed and have + * to go OOM. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + unsigned long reclaimable; + + available = reclaimable = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available?
+ */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac_classzone_idx(ac), alloc_flags, available)) { + /* + * If we didn't make any progress and have a lot of + * dirty + writeback pages then we should wait for + * an IO to complete to slow down the reclaim and + * prevent a premature OOM + */ + if (!did_some_progress) { + unsigned long writeback; + unsigned long dirty; + + writeback = zone_page_state_snapshot(zone, + NR_WRITEBACK); + dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + + if (2*(writeback + dirty) > reclaimable) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + return true; + } + } + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that + * a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to + * do a short sleep here rather than calling + * cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + + return true; + } + } + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; - unsigned long pages_reclaimed = 0; + unsigned int alloc_flags; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; - bool deferred_compaction = false; - int contended_compaction = COMPACT_CONTENDED_NONE; + enum compact_result compact_result; + int compaction_retries = 0; + int no_progress_loops = 0; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3153,17 +3596,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets.
- */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3219,8 +3651,7 @@ retry: */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); + &compact_result); if (page) goto got_pg; @@ -3233,35 +3664,19 @@ retry: * to heavily disrupt the system, so we fail the allocation * instead of entering direct reclaim. */ - if (deferred_compaction) + if (compact_result == COMPACT_DEFERRED) goto nopage; /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. + * Compaction is contended so rather back off than cause + * excessive stalls. */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) - goto nopage; - - /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. - */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) + if(compact_result == COMPACT_CONTENDED) goto nopage; } - /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. 
- */ - if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; + if (order && compaction_made_progress(compact_result)) + compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3273,14 +3688,38 @@ retry: if (gfp_mask & __GFP_NORETRY) goto noretry; - /* Keep reclaiming pages as long as there is reasonable progress */ - pages_reclaimed += did_some_progress; - if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || - ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + /* + * Do not retry costly high order allocations unless they are + * __GFP_REPEAT + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + goto noretry; + + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + no_progress_loops = 0; + else + no_progress_loops++; + + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, no_progress_loops)) + goto retry; + + /* + * It doesn't make any sense to retry for the compaction if the order-0 + * reclaim is not able to make any progress because the current + * implementation of the compaction depends on the sufficient amount + * of free memory (see __compaction_suitable) + */ + if (did_some_progress > 0 && + should_compact_retry(ac, order, alloc_flags, + compact_result, &migration_mode, + compaction_retries)) goto retry; - } /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -3288,19 +3727,28 @@ retry: goto got_pg; /* Retry as long as the OOM killer is making 
progress */ - if (did_some_progress) + if (did_some_progress) { + no_progress_loops = 0; goto retry; + } noretry: /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary + * High-order allocations do not necessarily loop after direct reclaim + * and reclaim/compaction depends on compaction being called after + * reclaim so call directly if necessary. + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. All other requests should tolerate + * at least light sync migration. */ + if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_ASYNC; + else + migration_mode = MIGRATE_SYNC_LIGHT; page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); + &compact_result); if (page) goto got_pg; nopage: @@ -3316,17 +3764,24 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; - struct page *page = NULL; + struct page *page; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_mask |= __GFP_HARDWALL; + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ 
-3350,49 +3805,54 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); - if (!ac.preferred_zone) - goto out; - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { + page = NULL; + goto no_zone; + } /* First allocation attempt */ - alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); - if (unlikely(!page)) { - /* - * Runtime PM, block IO and its error handling path - * can deadlock because I/O on the device might not - * complete. - */ - alloc_mask = memalloc_noio_flags(gfp_mask); - ac.spread_dirty_pages = false; - - page = __alloc_pages_slowpath(alloc_mask, order, &ac); - } + if (likely(page)) + goto out; - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); + /* + * Runtime PM, block IO and its error handling path can deadlock + * because I/O on the device might not complete. + */ + alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + /* + * Restore the original nodemask if it was potentially replaced with + * &cpuset_current_mems_allowed to optimize the fast-path attempt. 
+ */ + if (cpusets_enabled()) + ac.nodemask = nodemask; + page = __alloc_pages_slowpath(alloc_mask, order, &ac); -out: +no_zone: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { + alloc_mask = gfp_mask; goto retry_cpuset; + } + +out: + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; } @@ -3790,6 +4250,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) @@ -3798,12 +4260,19 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; - val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone->managed_pages; + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #else - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } @@ -4390,13 +4859,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int 
local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif @@ -6395,49 +6863,6 @@ void setup_per_zone_wmarks(void) } /* - * The inactive anon list should be small enough that the VM never has to - * do too much work, but large enough that each inactive page has a chance - * to be referenced again before it is swapped out. - * - * The inactive_anon ratio is the target ratio of ACTIVE_ANON to - * INACTIVE_ANON pages on this zone's LRU, maintained by the - * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of - * the anonymous pages are kept on the inactive list. - * - * total target max - * memory ratio inactive anon - * ------------------------------------- - * 10MB 1 5MB - * 100MB 1 50MB - * 1GB 3 250MB - * 10GB 10 0.9GB - * 100GB 31 3GB - * 1TB 101 10GB - * 10TB 320 32GB - */ -static void __meminit calculate_zone_inactive_ratio(struct zone *zone) -{ - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->managed_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; - - zone->inactive_ratio = ratio; -} - -static void __meminit setup_per_zone_inactive_ratio(void) -{ - struct zone *zone; - - for_each_zone(zone) - calculate_zone_inactive_ratio(zone); -} - -/* * Initialise min_free_kbytes. * * For small machines we want it small (128k min). 
For large machines @@ -6482,10 +6907,9 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); - setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_wmark_min) +core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -6725,98 +7149,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct zone *zone, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return zone->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> 
(BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - struct zone *zone; - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages @@ -6864,7 +7196,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) @@ -7177,7 +7509,8 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be isolated before calling this. 
+ * All pages in the range must be in a single zone and isolated + * before calling this. */ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/page_io.c b/mm/page_io.c index cd92e3d67a32..242dba07545b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -279,7 +279,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos); + ret = mapping->a_ops->direct_IO(&kiocb, &from); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; @@ -353,7 +353,11 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { - swap_slot_free_notify(page); + if (trylock_page(page)) { + swap_slot_free_notify(page); + unlock_page(page); + } + count_vm_event(PSWPIN); return 0; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..612122bf6a42 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } +/* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { @@ -288,13 +289,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. 
*/ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/page_owner.c b/mm/page_owner.c index ac3d8d129974..792b56da13d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", @@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + /* * We are safe to check buddy flag and order, because * this is init stage and only single thread runs. 
diff --git a/mm/page_poison.c b/mm/page_poison.c index 479e7ea2bea6..1eae5fad2446 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -13,13 +13,7 @@ static int early_page_poison_param(char *buf) { if (!buf) return -EINVAL; - - if (strcmp(buf, "on") == 0) - want_page_poisoning = true; - else if (strcmp(buf, "off") == 0) - want_page_poisoning = false; - - return 0; + return strtobool(buf, &want_page_poisoning); } early_param("page_poison", early_page_poison_param); diff --git a/mm/rmap.c b/mm/rmap.c index 307b555024ef..8a839935b18c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - BUG_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->degree); put_anon_vma(anon_vma); list_del(&avc->same_vma); @@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page, int nr = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - SetPageSwapBacked(page); + __SetPageSwapBacked(page); if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ diff --git a/mm/shmem.c b/mm/shmem.c index 719bd6b88d98..e418a995427d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -122,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t 
gfp, int *fault_type); + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); static inline int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, int *fault_type) + struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -169,7 +169,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ @@ -528,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (partial_start) { struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + shmem_getpage(inode, start - 1, &page, SGP_READ); if (page) { unsigned int top = PAGE_SIZE; if (start > end) { @@ -543,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } if (partial_end) { struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ, NULL); + shmem_getpage(inode, end, &page, SGP_READ); if (page) { zero_user_segment(page, 0, partial_end); set_page_dirty(page); @@ -947,8 +947,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +971,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void 
shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1018,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,9 +1072,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); SetPageUptodate(newpage); - SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1130,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * * If we allocate a new one we do not mark it dirty. 
That's up to the * vm. If we swap it in we mark it dirty since we also free the swap - * entry since a page cannot live in both the swap and page cache + * entry since a page cannot live in both the swap and page cache. + * + * fault_mm and fault_type are only supplied by shmem_fault: + * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) + struct page **pagep, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; @@ -1155,7 +1146,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1183,14 +1174,19 @@ repeat: */ info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = fault_mm ? : current->mm; if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - /* here we actually do the io */ - if (fault_type) + /* Or update major stats only when swapin succeeds?? 
*/ + if (fault_type) { *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + } + /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); if (!page) { error = -ENOMEM; @@ -1217,7 +1213,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1275,13 +1271,10 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageSwapBacked(page); - __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (error) goto decused; @@ -1321,12 +1314,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1372,6 +1363,7 @@ unlock: static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); int error; int ret = VM_FAULT_LOCKED; @@ -1433,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS); - - if (ret & VM_FAULT_MAJOR) { - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - } return ret; } @@ -1587,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + return shmem_getpage(inode, index, pagep, SGP_WRITE); } static int @@ -1633,7 +1621,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. */ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1653,14 +1641,17 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp, NULL); + error = shmem_getpage(inode, index, &page, sgp); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) @@ -1766,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -1788,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, - SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -2232,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC, - NULL); + 
error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, @@ -2551,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + error = shmem_getpage(inode, 0, &page, SGP_WRITE); if (error) { iput(inode); return error; @@ -2592,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); unlock_page(page); @@ -2646,10 +2635,10 @@ static int shmem_initxattrs(struct inode *inode, } static int shmem_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); @@ -3123,7 +3112,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(inode->i_link); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -3495,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, int error; BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + gfp, NULL, NULL); if (error) page = ERR_PTR(error); else diff --git a/mm/slab.c b/mm/slab.c index 
17e2848979c5..cc8bbc1e6bc9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list); +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list); static int slab_early_init = 1; #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) @@ -421,8 +426,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -519,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } @@ -644,7 +640,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) @@ -850,6 +846,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags) } #endif +static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) +{ + struct kmem_cache_node *n; + + 
/* + * Set up the kmem_cache_node for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + n = get_node(cachep, node); + if (n) { + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; + spin_unlock_irq(&n->list_lock); + + return 0; + } + + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) + return -ENOMEM; + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + n->free_limit = + (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; + + /* + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient + * protection here. + */ + cachep->node[node] = n; + + return 0; +} + /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node @@ -861,46 +897,82 @@ static inline gfp_t gfp_exact_node(gfp_t flags) */ static int init_cache_node_node(int node) { + int ret; struct kmem_cache *cachep; - struct kmem_cache_node *n; - const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (!n) { - n = kmalloc_node(memsize, GFP_KERNEL, node); - if (!n) - return -ENOMEM; - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex is sufficient - * protection here. 
- */ - cachep->node[node] = n; - } - - spin_lock_irq(&n->list_lock); - n->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + ret = init_cache_node(cachep, node, GFP_KERNEL); + if (ret) + return ret; } + return 0; } -static inline int slabs_tofree(struct kmem_cache *cachep, - struct kmem_cache_node *n) +static int setup_kmem_cache_node(struct kmem_cache *cachep, + int node, gfp_t gfp, bool force_change) { - return (n->free_objects + cachep->num - 1) / cachep->num; + int ret = -ENOMEM; + struct kmem_cache_node *n; + struct array_cache *old_shared = NULL; + struct array_cache *new_shared = NULL; + struct alien_cache **new_alien = NULL; + LIST_HEAD(list); + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); + if (!new_shared) + goto fail; + } + + ret = init_cache_node(cachep, node, gfp); + if (ret) + goto fail; + + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); + n->shared->avail = 0; + } + + if (!n->shared || force_change) { + old_shared = n->shared; + n->shared = new_shared; + new_shared = NULL; + } + + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + + /* + * To protect lockless access to n->shared during irq disabled context. + * If n->shared isn't NULL in irq disabled context, accessing to it is + * guaranteed to be valid until irq is re-enabled, because it will be + * freed after synchronize_sched(). 
+ */ + if (force_change) + synchronize_sched(); + +fail: + kfree(old_shared); + kfree(new_shared); + free_alien_cache(new_alien); + + return ret; } static void cpuup_canceled(long cpu) @@ -967,14 +1039,13 @@ free_slab: n = get_node(cachep, node); if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); } } static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -993,44 +1064,9 @@ static int cpuup_prepare(long cpu) * array caches */ list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *shared = NULL; - struct alien_cache **alien = NULL; - - if (cachep->shared) { - shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, - 0xbaadf00d, GFP_KERNEL); - if (!shared) - goto bad; - } - if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); - if (!alien) { - kfree(shared); - goto bad; - } - } - n = get_node(cachep, node); - BUG_ON(!n); - - spin_lock_irq(&n->list_lock); - if (!n->shared) { - /* - * We are serialised from CPU_DEAD or - * CPU_UP_CANCELLED by the cpucontrol lock - */ - n->shared = shared; - shared = NULL; - } -#ifdef CONFIG_NUMA - if (!n->alien) { - n->alien = alien; - alien = NULL; - } -#endif - spin_unlock_irq(&n->list_lock); - kfree(shared); - free_alien_cache(alien); + err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); + if (err) + goto bad; } return 0; @@ -1119,7 +1155,7 @@ static int __meminit drain_cache_node_node(int node) if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); if (!list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial)) { @@ -1200,6 +1236,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned 
int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
@@ -1212,7 +1303,7 @@ void __init kmem_cache_init(void) sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; - if (num_possible_nodes() == 1) + if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) @@ -1781,7 +1872,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, /* * Needed to avoid possible looping condition - * in cache_grow() + * in cache_grow_begin() */ if (OFF_SLAB(freelist_cache)) continue; @@ -2138,7 +2229,7 @@ done: cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; - if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2180,6 +2271,11 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } +static void check_mutex_acquired(void) +{ + BUG_ON(!mutex_is_locked(&slab_mutex)); +} + static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP @@ -2199,13 +2295,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) +#define check_mutex_acquired() do { } while(0) #define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, - int force, int node); +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, + int node, bool free_all, struct list_head *list) +{ + int tofree; + + if (!ac || !ac->avail) + return; + + tofree = free_all ? 
ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + + free_block(cachep, ac->entry, tofree, node, list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); +} static void do_drain(void *arg) { @@ -2229,6 +2339,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_cache_node *n; int node; + LIST_HEAD(list); on_each_cpu(do_drain, cachep, 1); check_irq_on(); @@ -2236,8 +2347,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep) if (n->alien) drain_alien_cache(cachep, n->alien); - for_each_kmem_cache_node(cachep, node, n) - drain_array(cachep, n, n->shared, 1, node); + for_each_kmem_cache_node(cachep, node, n) { + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } } /* @@ -2288,7 +2404,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) check_irq_on(); for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); ret += !list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial); @@ -2306,6 +2422,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2412,15 +2530,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization methode available. + * return true if the pre-computed list is available, false otherwize. 
+ */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. 
+ */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2434,17 +2652,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); - } -} - -static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) -{ - if (CONFIG_ZONE_DMA_FLAG) { - if (flags & GFP_DMA) - BUG_ON(!(cachep->allocflags & GFP_DMA)); - else - BUG_ON(cachep->allocflags & GFP_DMA); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -2502,13 +2711,15 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. 
*/ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, struct page *page) +static struct page *cache_grow_begin(struct kmem_cache *cachep, + gfp_t flags, int nodeid) { void *freelist; size_t offset; gfp_t local_flags; + int page_node; struct kmem_cache_node *n; + struct page *page; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2520,43 +2731,35 @@ static int cache_grow(struct kmem_cache *cachep, } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = get_node(cachep, nodeid); - spin_lock(&n->list_lock); - - /* Get colour for the slab, and cal the next value. */ - offset = n->colour_next; - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - spin_unlock(&n->list_lock); - - offset *= cachep->colour_off; - if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* - * The test for missing atomic flag is performed here, rather than - * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is seriously mis-behaving they - * will eventually be caught here (where it matters). - */ - kmem_flagcheck(cachep, flags); - - /* * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!page) - page = kmem_getpages(cachep, local_flags, nodeid); + page = kmem_getpages(cachep, local_flags, nodeid); if (!page) goto failed; + page_node = page_to_nid(page); + n = get_node(cachep, page_node); + + /* Get colour for the slab, and cal the next value. */ + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + + offset = n->colour_next; + if (offset >= cachep->colour) + offset = 0; + + offset *= cachep->colour_off; + /* Get slab management. 
*/ freelist = alloc_slabmgmt(cachep, page, offset, - local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, page_node); if (OFF_SLAB(cachep) && !freelist) goto opps1; @@ -2567,21 +2770,40 @@ static int cache_grow(struct kmem_cache *cachep, if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - check_irq_off(); - spin_lock(&n->list_lock); - /* Make slab active. */ - list_add_tail(&page->lru, &(n->slabs_free)); - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num; - spin_unlock(&n->list_lock); - return 1; + return page; + opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return 0; + return NULL; +} + +static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +{ + struct kmem_cache_node *n; + void *list = NULL; + + check_irq_off(); + + if (!page) + return; + + INIT_LIST_HEAD(&page->lru); + n = get_node(cachep, page_to_nid(page)); + + spin_lock(&n->list_lock); + if (!page->active) + list_add_tail(&page->lru, &(n->slabs_free)); + else + fixup_slab_list(cachep, n, page, &list); + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; + spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); } #if DEBUG @@ -2785,18 +3007,42 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, return obj; } +/* + * Slab list should be fixed up by fixup_slab_list() for existing slab + * or cache_grow_end() for new slab + */ +static __always_inline int alloc_block(struct kmem_cache *cachep, + struct array_cache *ac, struct page *page, int batchcount) +{ + /* + * There must be at least one object available for + * allocation. 
+ */ + BUG_ON(page->active >= cachep->num); + + while (page->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, page); + } + + return batchcount; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; - struct array_cache *ac; + struct array_cache *ac, *shared; int node; void *list = NULL; + struct page *page; check_irq_off(); node = numa_mem_id(); -retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2810,16 +3056,20 @@ retry: n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); + shared = READ_ONCE(n->shared); + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ - if (n->shared && transfer_objects(ac, n->shared, batchcount)) { - n->shared->touched = 1; + if (shared && transfer_objects(ac, shared, batchcount)) { + shared->touched = 1; goto alloc_done; } while (batchcount > 0) { - struct page *page; /* Get slab alloc is to come from. */ page = get_first_slab(n, false); if (!page) @@ -2827,21 +3077,7 @@ retry: check_spinlock_acquired(cachep); - /* - * The slab was either on partial or free list so - * there must be at least one object available for - * allocation. 
- */ - BUG_ON(page->active >= cachep->num); - - while (page->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, page); - } - + batchcount = alloc_block(cachep, ac, page, batchcount); fixup_slab_list(cachep, n, page, &list); } @@ -2851,9 +3087,8 @@ alloc_done: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); +direct_grow: if (unlikely(!ac->avail)) { - int x; - /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { void *obj = cache_alloc_pfmemalloc(cachep, n, flags); @@ -2862,18 +3097,19 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); + page = cache_grow_begin(cachep, gfp_exact_node(flags), node); - /* cache_grow can reenable interrupts, then ac could change. */ + /* + * cache_grow_begin() can reenable interrupts, + * then ac could change. + */ ac = cpu_cache_get(cachep); - node = numa_mem_id(); + if (!ac->avail && page) + alloc_block(cachep, ac, page, batchcount); + cache_grow_end(cachep, page); - /* no objects in sight? abort */ - if (!x && ac->avail == 0) + if (!ac->avail) return NULL; - - if (!ac->avail) /* objects refilled by interrupt? 
*/ - goto retry; } ac->touched = 1; @@ -2884,9 +3120,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(gfpflags_allow_blocking(flags)); -#if DEBUG - kmem_flagcheck(cachep, flags); -#endif } #if DEBUG @@ -2998,19 +3231,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; - gfp_t local_flags; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; + struct page *page; int nid; unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); @@ -3040,33 +3271,19 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - struct page *page; - - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - kmem_flagcheck(cache, flags); - page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); + page = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, page); if (page) { + nid = page_to_nid(page); + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + /* - * Insert into the appropriate per node queues + * Another processor may allocate the objects in + * the slab since we are not holding any locks. */ - nid = page_to_nid(page); - if (cache_grow(cache, flags, nid, page)) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (!obj) - /* - * Another processor may allocate the - * objects in the slab since we are - * not holding any locks. 
- */ - goto retry; - } else { - /* cache_grow already freed obj */ - obj = NULL; - } + if (!obj) + goto retry; } } @@ -3083,15 +3300,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct page *page; struct kmem_cache_node *n; - void *obj; + void *obj = NULL; void *list = NULL; - int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); -retry: check_irq_off(); spin_lock(&n->list_lock); page = get_first_slab(n, false); @@ -3113,18 +3328,18 @@ retry: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); - goto done; + return obj; must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); - if (x) - goto retry; - - return fallback_alloc(cachep, flags); + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ + obj = slab_get_obj(cachep, page); + } + cache_grow_end(cachep, page); -done: - return obj; + return obj ? 
obj : fallback_alloc(cachep, flags); } static __always_inline void * @@ -3242,6 +3457,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp, { int i; struct kmem_cache_node *n = get_node(cachep, node); + struct page *page; + + n->free_objects += nr_objects; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3254,17 +3472,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); - n->free_objects++; /* fixup slab chains */ - if (page->active == 0) { - if (n->free_objects > n->free_limit) { - n->free_objects -= cachep->num; - list_add_tail(&page->lru, list); - } else { - list_add(&page->lru, &n->slabs_free); - } - } else { + if (page->active == 0) + list_add(&page->lru, &n->slabs_free); + else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3272,6 +3484,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp, list_add_tail(&page->lru, &n->slabs_partial); } } + + while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { + n->free_objects -= cachep->num; + + page = list_last_entry(&n->slabs_free, struct page, lru); + list_del(&page->lru); + list_add(&page->lru, list); + } } static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) @@ -3327,9 +3547,17 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct array_cache *ac = cpu_cache_get(cachep); + /* Put the object into the quarantine, don't touch it for now. 
*/ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} - kasan_slab_free(cachep, objp); +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); @@ -3645,72 +3873,19 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) +static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) { + int ret; int node; struct kmem_cache_node *n; - struct array_cache *new_shared; - struct alien_cache **new_alien = NULL; for_each_online_node(node) { - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - new_shared = NULL; - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d, gfp); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; - } - } - - n = get_node(cachep, node); - if (n) { - struct array_cache *shared = n->shared; - LIST_HEAD(list); - - spin_lock_irq(&n->list_lock); - - if (shared) - free_block(cachep, shared->entry, - shared->avail, node, &list); - - n->shared = new_shared; - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - kfree(shared); - free_alien_cache(new_alien); - continue; - } - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) { - free_alien_cache(new_alien); - kfree(new_shared); + ret = setup_kmem_cache_node(cachep, node, gfp, true); + if (ret) goto fail; - } - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - n->shared = new_shared; - n->alien = new_alien; - 
n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - cachep->node[node] = n; } + return 0; fail: @@ -3752,7 +3927,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; if (!prev) - goto alloc_node; + goto setup_node; for_each_online_cpu(cpu) { LIST_HEAD(list); @@ -3769,8 +3944,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } free_percpu(prev); -alloc_node: - return alloc_kmem_cache_node(cachep, gfp); +setup_node: + return setup_kmem_cache_nodes(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, @@ -3804,6 +3979,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3857,6 +4036,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); @@ -3869,29 +4049,26 @@ skip_setup: * if drain_array() is used on the shared array. */ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int force, int node) + struct array_cache *ac, int node) { LIST_HEAD(list); - int tofree; + + /* ac from n->shared can be freed if we don't hold the slab_mutex. */ + check_mutex_acquired(); if (!ac || !ac->avail) return; - if (ac->touched && !force) { + + if (ac->touched) { ac->touched = 0; - } else { - spin_lock_irq(&n->list_lock); - if (ac->avail) { - tofree = force ? 
ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node, &list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); - } - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); + return; } + + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); } /** @@ -3929,7 +4106,7 @@ static void cache_reap(struct work_struct *w) reap_alien(searchp, n); - drain_array(searchp, n, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), node); /* * These are racy checks but it does not matter @@ -3940,7 +4117,7 @@ static void cache_reap(struct work_struct *w) n->next_reap = jiffies + REAPTIMEOUT_NODE; - drain_array(searchp, n, n->shared, 0, node); + drain_array(searchp, n, n->shared, node); if (n->free_touched) n->free_touched = 0; @@ -4324,7 +4501,7 @@ size_t ksize(const void *objp) /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. 
*/ - kasan_krealloc(objp, size, GFP_NOWAIT); + kasan_unpoison_shadow(objp, size); return size; } diff --git a/mm/slab.h b/mm/slab.h index 5969769fbee6..dedb1a920fb8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3239bfd758e6..a65dad7fdcd1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); + kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; @@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); diff --git a/mm/slub.c b/mm/slub.c index 4dbb109eb8cd..825ff4505336 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; @@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. 
This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes. + * This means scanning over all nodes to look for partial slabs which + * may be expensive if we do it every time we are trying to find a slab * with available objects. */ if (!s->remote_node_defrag_ratio || @@ -3635,8 +3635,9 @@ size_t ksize(const void *object) { size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, - so we need unpoison this area. */ - kasan_krealloc(object, size, GFP_NOWAIT); + * so we need to unpoison this area. + */ + kasan_unpoison_shadow(object, size); return size; } EXPORT_SYMBOL(ksize); @@ -3697,7 +3698,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) * s->cpu_partial is checked locklessly (see put_cpu_partial), * so we have to make sure the change is visible. 
*/ - kick_all_cpus_sync(); + synchronize_sched(); } flush_all(s); diff --git a/mm/swap.c b/mm/swap.c index a0bc206b4ac6..95916142fc46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); +#endif /* * This path almost never happens for VM activity - pages are normally @@ -274,8 +277,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, } #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); - static void activate_page_drain(int cpu) { struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); @@ -728,6 +729,11 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + if (is_huge_zero_page(page)) { + put_huge_zero_page(); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; diff --git a/mm/swap_state.c b/mm/swap_state.c index 366ce3518703..0d457e7db8d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); @@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return new_page; } radix_tree_preload_end(); - ClearPageSwapBacked(new_page); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely diff --git a/mm/swapfile.c b/mm/swapfile.c index 83874eced5bf..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -922,18 +922,19 @@ out: * to it. 
And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). */ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { diff --git a/mm/util.c b/mm/util.c index 6cc81e7b8705..917e0e3d0f8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -297,7 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); @@ -346,6 +347,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. 
+ */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae7d20b447ff..cf7ad1a53be0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -21,6 +21,7 @@ #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> +#include <linux/notifier.h> #include <linux/rbtree.h> #include <linux/radix-tree.h> #include <linux/rcupdate.h> @@ -274,13 +275,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -344,6 +344,8 @@ static void __insert_vmap_area(struct vmap_area *va) static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. 
@@ -363,6 +365,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) @@ -468,6 +472,16 @@ overflow: purged = 1; goto retry; } + + if (gfpflags_allow_blocking(gfp_mask)) { + unsigned long freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + if (freed > 0) { + purged = 0; + goto retry; + } + } + if (printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); @@ -475,6 +489,18 @@ overflow: return ERR_PTR(-EBUSY); } +int register_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); + +int unregister_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); + static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); @@ -601,7 +627,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; @@ -620,20 +646,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (sync) purge_fragmented_blocks_allcpus(); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = 
llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - rcu_read_unlock(); if (nr) atomic_sub(nr, &vmap_lazy_nr); @@ -643,7 +663,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -678,9 +698,15 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + int nr_lazy; + + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } diff --git a/mm/vmscan.c b/mm/vmscan.c index b934223eaa45..c4a2f4512fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -static unsigned long zone_reclaimable_pages(struct zone *zone) +unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. 
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1720,7 +1716,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. 
*/ @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { @@ -1865,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan, free_hot_cold_page_list(&l_hold, true); } -#ifdef CONFIG_SWAP -static bool inactive_anon_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_ANON); - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - - return inactive * zone->inactive_ratio < active; -} - -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @lruvec: LRU vector to check +/* + * The inactive anon list should be small enough that the VM never has + * to do too much work. 
* - * Returns true if the zone does not have enough inactive anon pages, - * meaning some active anon pages need to be deactivated. - */ -static bool inactive_anon_is_low(struct lruvec *lruvec) -{ - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!total_swap_pages) - return false; - - if (!mem_cgroup_disabled()) - return mem_cgroup_inactive_anon_is_low(lruvec); - - return inactive_anon_is_low_global(lruvec_zone(lruvec)); -} -#else -static inline bool inactive_anon_is_low(struct lruvec *lruvec) -{ - return false; -} -#endif - -/** - * inactive_file_is_low - check if file pages need to be deactivated - * @lruvec: LRU vector to check + * The inactive file list should be small enough to leave most memory + * to the established workingset on the scan-resistant active list, + * but large enough to avoid thrashing the aggregate readahead window. * - * When the system is doing streaming IO, memory pressure here - * ensures that active file pages get deactivated, until more - * than half of the file pages are on the inactive list. + * Both inactive lists should also be large enough that each inactive + * page has a chance to be referenced again before it is reclaimed. * - * Once we get to that situation, protect the system's working - * set from being evicted by disabling active file page aging. + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * - * This uses a different ratio than the anonymous pages, because - * the page cache uses a use-once replacement algorithm. 
+ * total target max + * memory ratio inactive + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB */ -static bool inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file) { + unsigned long inactive_ratio; unsigned long inactive; unsigned long active; + unsigned long gb; - inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!file && !total_swap_pages) + return false; - return active > inactive; -} + inactive = lruvec_lru_size(lruvec, file * LRU_FILE); + active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); -static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) -{ - if (is_file_lru(lru)) - return inactive_file_is_low(lruvec); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); else - return inactive_anon_is_low(lruvec); + inactive_ratio = 1; + + return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, lru)) + if (inactive_list_is_low(lruvec, is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2062,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. 
*/ - if (!inactive_file_is_low(lruvec) && + if (!inactive_list_is_low(lruvec, true) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2304,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2482,7 +2459,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Returns true if compaction should go ahead for a high-order request, or * the high-order allocation would succeed without compaction. */ -static inline bool compaction_ready(struct zone *zone, int order) +static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) { unsigned long balance_gap, watermark; bool watermark_ok; @@ -2496,7 +2473,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); /* * If compaction is deferred, reclaim up to a point where @@ -2509,7 +2486,7 @@ static inline bool compaction_ready(struct zone *zone, int order) * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. 
*/ - if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) + if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2530,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order) * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * Returns true if a zone was reclaimable. */ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2541,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); - bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2553,7 +2527,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, - requested_highidx, sc->nodemask) { + gfp_zone(sc->gfp_mask), sc->nodemask) { enum zone_type classzone_idx; if (!populated_zone(zone)) @@ -2589,7 +2563,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && zonelist_zone_idx(z) <= requested_highidx && - compaction_ready(zone, sc->order)) { + compaction_ready(zone, sc->order, requested_highidx)) { sc->compaction_ready = true; continue; } @@ -2606,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; - if (nr_soft_reclaimed) - reclaimable = true; /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) - reclaimable = true; - - if 
(global_reclaim(sc) && - !reclaimable && zone_reclaimable(zone)) - reclaimable = true; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); } /* @@ -2624,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return reclaimable; } /* @@ -2650,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool zones_reclaimable; retry: delayacct_freepages_start(); @@ -2661,7 +2625,7 @@ retry: vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - zones_reclaimable = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2708,10 +2672,6 @@ retry: goto retry; } - /* Any of the zones still reclaimable? Don't OOM. */ - if (zones_reclaimable) - return 1; - return 0; } @@ -2965,7 +2925,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3318,6 +3278,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, /* Try to sleep for a short interval */ if (prepare_kswapd_sleep(pgdat, order, remaining, balanced_classzone_idx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. 
+ */ + wakeup_kcompactd(pgdat, order, classzone_idx); + remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3341,20 +3315,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - /* - * Compaction records what page blocks it recently failed to - * isolate pages from and skips them in the future scanning. - * When kswapd is going to sleep, it is reasonable to assume - * that pages and compaction may succeed so reset the cache. - */ - reset_isolation_suitable(pgdat); - - /* - * We have freed the memory, now we should compact it to make - * allocation of the requested order possible. - */ - wakeup_kcompactd(pgdat, order, classzone_idx); - if (!kthread_should_stop()) schedule(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 5e4300482897..77e42ef388c2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -570,49 +570,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #ifdef CONFIG_NUMA /* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. - */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - if (z->zone_pgdat == preferred_zone->zone_pgdat) { - __inc_zone_state(z, NUMA_HIT); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } - if (z->node == ((flags & __GFP_OTHER_NODE) ? - preferred_zone->node : numa_node_id())) - __inc_zone_state(z, NUMA_LOCAL); - else - __inc_zone_state(z, NUMA_OTHER); -} - -/* * Determine the per node value of a stat item. 
*/ unsigned long node_page_state(int node, enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_page_state(zones + i, item); + + return count; } #endif @@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!memmap_valid_within(pfn, page, zone)) continue; + if (page_zone(page) != zone) + continue; + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) continue; page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (PageBuddy(page)) { pfn += (1UL << page_order(page)) - 1; continue; @@ -1376,7 +1352,66 @@ static const struct file_operations proc_vmstat_file_operations = { static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; -static cpumask_var_t cpu_stat_off; + +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in 
per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ static void vmstat_update(struct work_struct *w) { @@ -1385,24 +1420,10 @@ static void vmstat_update(struct work_struct *w) * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. - * If we were marked on cpu_stat_off clear the flag - * so that vmstat_shepherd doesn't schedule us again. */ - if (!cpumask_test_and_clear_cpu(smp_processor_id(), - cpu_stat_off)) { - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), vmstat_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - } - } else { - /* - * We did not update any counters so the app may be in - * a mode where it does not cause counter updates. - * We may be uselessly running vmstat_update. 
- * Defer the checking for differentials to the - * shepherd thread on a different processor. - */ - cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } @@ -1434,16 +1455,17 @@ static bool need_update(int cpu) return false; } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ void quiet_vmstat(void) { if (system_state != SYSTEM_RUNNING) return; - /* - * If we are already in hands of the shepherd then there - * is nothing for us to do here. - */ - if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + if (!delayed_work_pending(this_cpu_ptr(&vmstat_work))) return; if (!need_update(smp_processor_id())) @@ -1458,7 +1480,6 @@ void quiet_vmstat(void) refresh_cpu_vm_stats(false); } - /* * Shepherd worker thread that checks the * differentials of processors that have their worker @@ -1475,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) { + for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (need_update(cpu)) { - if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); - } else { - /* - * Cancel the work if quiet_vmstat has put this - * cpu on cpu_stat_off because the work item might - * be still scheduled - */ - cancel_delayed_work(dw); - } + if (!delayed_work_pending(dw) && need_update(cpu)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); } put_online_cpus(); @@ -1504,10 +1516,6 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) - BUG(); - cpumask_copy(cpu_stat_off, cpu_online_mask); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); 
schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); @@ -1542,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); node_set_state(cpu_to_node(cpu), N_CPU); - cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - cpumask_clear_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/mm/z3fold.c b/mm/z3fold.c new file mode 100644 index 000000000000..34917d55d311 --- /dev/null +++ b/mm/z3fold.c @@ -0,0 +1,792 @@ +/* + * z3fold.c + * + * Author: Vitaly Wool <vitaly.wool@konsulko.com> + * Copyright (C) 2016, Sony Mobile Communications Inc. + * + * This implementation is based on zbud written by Seth Jennings. + * + * z3fold is an special purpose allocator for storing compressed pages. It + * can store up to three compressed pages per page which improves the + * compression ratio of zbud while retaining its main concepts (e. g. always + * storing an integral number of objects per page) and simplicity. + * It still has simple and deterministic reclaim properties that make it + * preferable to a higher density approach (with no requirement on integral + * number of object per page) when reclaim is used. + * + * As in zbud, pages are divided into "chunks". The size of the chunks is + * fixed at compile time and is determined by NCHUNKS_ORDER below. + * + * z3fold doesn't export any API and is meant to be used via zpool API. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/zpool.h> + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by z3fold header, NCHUNKS will be calculated + * to 63 which shows the max number of free chunks in z3fold page, also there + * will be 63 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1) + +struct z3fold_pool; +struct z3fold_ops { + int (*evict)(struct z3fold_pool *pool, unsigned long handle); +}; + +/** + * struct z3fold_pool - stores metadata for each z3fold pool + * @lock: protects all pool fields and first|last_chunk fields of any + * z3fold page in the pool + * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; + * the lists each z3fold page is added to depends on the size of + * its free region. + * @buddied: list tracking the z3fold pages that contain 3 buddies; + * these z3fold pages are full + * @lru: list tracking the z3fold pages in LRU order by most recently + * added buddy. + * @pages_nr: number of z3fold pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. 
+ * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular z3fold pool. + */ +struct z3fold_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + const struct z3fold_ops *ops; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +}; + +enum buddy { + HEADLESS = 0, + FIRST, + MIDDLE, + LAST, + BUDDIES_MAX +}; + +/* + * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * z3fold page, except for HEADLESS pages + * @buddy: links the z3fold page into the relevant list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @middle_chunks: the size of the middle buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + * @first_num: the starting number (for the first handle) + */ +struct z3fold_header { + struct list_head buddy; + unsigned short first_chunks; + unsigned short middle_chunks; + unsigned short last_chunks; + unsigned short start_middle; + unsigned short first_num:NCHUNKS_ORDER; +}; + +/* + * Internal z3fold page flags + */ +enum z3fold_page_flags { + UNDER_RECLAIM = 0, + PAGE_HEADLESS, + MIDDLE_CHUNK_MAPPED, +}; + +/***************** + * Helpers +*****************/ + +/* Converts an allocation size in bytes to size in z3fold chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the z3fold header of a newly allocated z3fold page */ +static struct z3fold_header *init_z3fold_page(struct page *page) +{ + struct z3fold_header *zhdr = page_address(page); + + INIT_LIST_HEAD(&page->lru); + clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_HEADLESS, &page->private); + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + + zhdr->first_chunks = 0; + 
zhdr->middle_chunks = 0; + zhdr->last_chunks = 0; + zhdr->first_num = 0; + zhdr->start_middle = 0; + INIT_LIST_HEAD(&zhdr->buddy); + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_z3fold_page(struct z3fold_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a z3fold page + * Pool lock should be held as this function accesses first_num + */ +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + handle = (unsigned long)zhdr; + if (bud != HEADLESS) + handle += (bud + zhdr->first_num) & BUDDY_MASK; + return handle; +} + +/* Returns the z3fold page where a given handle is stored */ +static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +{ + return (struct z3fold_header *)(handle & PAGE_MASK); +} + +/* Returns buddy number */ +static enum buddy handle_to_buddy(unsigned long handle) +{ + struct z3fold_header *zhdr = handle_to_z3fold_header(handle); + return (handle - zhdr->first_num) & BUDDY_MASK; +} + +/* + * Returns the number of free chunks in a z3fold page. + * NB: can't be used with HEADLESS pages. + */ +static int num_free_chunks(struct z3fold_header *zhdr) +{ + int nfree; + /* + * If there is a middle object, pick up the bigger free space + * either before or after it. Otherwise just subtract the number + * of chunks occupied by the first and the last objects. + */ + if (zhdr->middle_chunks != 0) { + int nfree_before = zhdr->first_chunks ? + 0 : zhdr->start_middle - 1; + int nfree_after = zhdr->last_chunks ? 
+ 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks; + nfree = max(nfree_before, nfree_after); + } else + nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; + return nfree; +} + +/***************** + * API Functions +*****************/ +/** + * z3fold_create_pool() - create a new z3fold pool + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool; + int i; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. 
+ */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + kfree(pool); +} + +/* Has to be called with lock held */ +static int z3fold_compact_page(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + void *beg = zhdr; + + + if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) && + zhdr->middle_chunks != 0 && + zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + memmove(beg + ZHDR_SIZE_ALIGNED, + beg + (zhdr->start_middle << CHUNK_SHIFT), + zhdr->middle_chunks << CHUNK_SHIFT); + zhdr->first_chunks = zhdr->middle_chunks; + zhdr->middle_chunks = 0; + zhdr->start_middle = 0; + zhdr->first_num++; + return 1; + } + return 0; +} + +/** + * z3fold_alloc() - allocates a region of a given size + * @pool: z3fold pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as z3fold pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks = 0, i, freechunks; + struct z3fold_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + + if (size > PAGE_SIZE) + return -ENOSPC; + + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + bud = HEADLESS; + else { + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied z3fold page. 
*/ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct z3fold_header, buddy); + page = virt_to_page(zhdr); + if (zhdr->first_chunks == 0) { + if (zhdr->middle_chunks != 0 && + chunks >= zhdr->start_middle) + bud = LAST; + else + bud = FIRST; + } else if (zhdr->last_chunks == 0) + bud = LAST; + else if (zhdr->middle_chunks == 0) + bud = MIDDLE; + else { + pr_err("No free chunks in unbuddied\n"); + WARN_ON(1); + continue; + } + list_del(&zhdr->buddy); + goto found; + } + } + bud = FIRST; + spin_unlock(&pool->lock); + } + + /* Couldn't find unbuddied z3fold page, create new one */ + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_z3fold_page(page); + + if (bud == HEADLESS) { + set_bit(PAGE_HEADLESS, &page->private); + goto headless; + } + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else if (bud == LAST) + zhdr->last_chunks = chunks; + else { + zhdr->middle_chunks = chunks; + zhdr->start_middle = zhdr->first_chunks + 1; + } + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + +headless: + /* Add/move z3fold page to beginning of LRU */ + if (!list_empty(&page->lru)) + list_del(&page->lru); + + list_add(&page->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * z3fold_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by z3fold_alloc() + * + * In the case that the z3fold page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag 
being set, this function + * only sets the first|middle|last_chunks to 0. The page is actually freed + * once all buddies are evicted (see z3fold_reclaim_page() below). + */ +static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + int freechunks; + struct page *page; + enum buddy bud; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* HEADLESS page stored */ + bud = HEADLESS; + } else { + bud = (handle - zhdr->first_num) & BUDDY_MASK; + + switch (bud) { + case FIRST: + zhdr->first_chunks = 0; + break; + case MIDDLE: + zhdr->middle_chunks = 0; + zhdr->start_middle = 0; + break; + case LAST: + zhdr->last_chunks = 0; + break; + default: + pr_err("%s: unknown bud %d\n", __func__, bud); + WARN_ON(1); + spin_unlock(&pool->lock); + return; + } + } + + if (test_bit(UNDER_RECLAIM, &page->private)) { + /* z3fold page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + if (bud != HEADLESS) { + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + } + + if (bud == HEADLESS || + (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 && + zhdr->last_chunks == 0)) { + /* z3fold page is empty, free */ + list_del(&page->lru); + clear_bit(PAGE_HEADLESS, &page->private); + free_z3fold_page(zhdr); + pool->pages_nr--; + } else { + z3fold_compact_page(zhdr); + /* Add to the unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +/** + * z3fold_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retries: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * z3fold reclaim is different from normal system reclaim in that it is done + * from the bottom, up.
This is because only the bottom layer, z3fold, has + * information on how the allocations are organized within each z3fold page. + * This has the potential to create interesting locking situations between + * z3fold and the user, however. + * + * To avoid these, this is how z3fold_reclaim_page() should be called: + * + * The user detects a page should be reclaimed and calls z3fold_reclaim_page(). + * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and + * call the user-defined eviction handler with the pool and handle as + * arguments. + * + * If the handle cannot be evicted, the eviction handler should return + * non-zero. z3fold_reclaim_page() will add the z3fold page back to the + * appropriate list and try the next z3fold page on the LRU up to + * a user-defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the UNDER_RECLAIM bit being set in the private field of + * the underlying page. + * + * If all buddies in the z3fold page are successfully evicted, then the + * z3fold page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict, no eviction handler is registered or @retries is 0, + * -EAGAIN if the retry limit was hit.
+ */ +static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) +{ + int i, ret = 0, freechunks; + struct z3fold_header *zhdr; + struct page *page; + unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + page = list_last_entry(&pool->lru, struct page, lru); + list_del(&page->lru); + + /* Protect z3fold page against free */ + set_bit(UNDER_RECLAIM, &page->private); + zhdr = page_address(page); + if (!test_bit(PAGE_HEADLESS, &page->private)) { + list_del(&zhdr->buddy); + /* + * We need encode the handles before unlocking, since + * we can race with free that will set + * (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + middle_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->middle_chunks) + middle_handle = encode_handle(zhdr, MIDDLE); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + } else { + first_handle = encode_handle(zhdr, HEADLESS); + last_handle = middle_handle = 0; + } + + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (middle_handle) { + ret = pool->ops->evict(pool, middle_handle); + if (ret) + goto next; + } + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + clear_bit(UNDER_RECLAIM, &page->private); + if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) || + (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 && + zhdr->middle_chunks == 0)) { + /* + * All buddies are now free, free the z3fold page and + * return success. 
+ */ + clear_bit(PAGE_HEADLESS, &page->private); + free_z3fold_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks != 0 && + zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) { + /* Full, add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } else if (!test_bit(PAGE_HEADLESS, &page->private)) { + z3fold_compact_page(zhdr); + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + /* add to beginning of LRU */ + list_add(&page->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * z3fold_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * Extracts the buddy number from handle and constructs the pointer to the + * correct starting chunk within the page. + * + * Returns: a pointer to the mapped allocation + */ +static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + void *addr; + enum buddy buddy; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + addr = zhdr; + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) + goto out; + + buddy = handle_to_buddy(handle); + switch (buddy) { + case FIRST: + addr += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + addr += zhdr->start_middle << CHUNK_SHIFT; + set_bit(MIDDLE_CHUNK_MAPPED, &page->private); + break; + case LAST: + addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + break; + default: + pr_err("unknown buddy id %d\n", buddy); + WARN_ON(1); + addr = NULL; + break; + } +out: + spin_unlock(&pool->lock); + return addr; +} + +/** + * z3fold_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be 
unmapped + */ +static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + enum buddy buddy; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) { + spin_unlock(&pool->lock); + return; + } + + buddy = handle_to_buddy(handle); + if (buddy == MIDDLE) + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + spin_unlock(&pool->lock); +} + +/** + * z3fold_get_pool_size() - gets the z3fold pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +{ + return pool->pages_nr; +} + +/***************** + * zpool + ****************/ + +static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct z3fold_ops z3fold_zpool_ops = { + .evict = z3fold_zpool_evict +}; + +static void *z3fold_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct z3fold_pool *pool; + + pool = z3fold_create_pool(gfp, zpool_ops ? 
&z3fold_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void z3fold_zpool_destroy(void *pool) +{ + z3fold_destroy_pool(pool); +} + +static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return z3fold_alloc(pool, size, gfp, handle); +} +static void z3fold_zpool_free(void *pool, unsigned long handle) +{ + z3fold_free(pool, handle); +} + +static int z3fold_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = z3fold_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *z3fold_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return z3fold_map(pool, handle); +} +static void z3fold_zpool_unmap(void *pool, unsigned long handle) +{ + z3fold_unmap(pool, handle); +} + +static u64 z3fold_zpool_total_size(void *pool) +{ + return z3fold_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver z3fold_zpool_driver = { + .type = "z3fold", + .owner = THIS_MODULE, + .create = z3fold_zpool_create, + .destroy = z3fold_zpool_destroy, + .malloc = z3fold_zpool_malloc, + .free = z3fold_zpool_free, + .shrink = z3fold_zpool_shrink, + .map = z3fold_zpool_map, + .unmap = z3fold_zpool_unmap, + .total_size = z3fold_zpool_total_size, +}; + +MODULE_ALIAS("zpool-z3fold"); + +static int __init init_z3fold(void) +{ + /* Make sure the z3fold header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED); + zpool_register_driver(&z3fold_zpool_driver); + + return 0; +} + +static void __exit exit_z3fold(void) +{ + zpool_unregister_driver(&z3fold_zpool_driver); +} + +module_init(init_z3fold); +module_exit(exit_z3fold); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>"); +MODULE_DESCRIPTION("3-Fold Allocator for 
Compressed Pages"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e72efb109fde..72698db958e7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 
0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -413,26 +417,28 @@ static int is_last_page(struct page *page) return PagePrivate2(page); } -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, +static void get_zspage_mapping(struct page *first_page, + unsigned int *class_idx, enum fullness_group *fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - m = (unsigned long)page->mapping; + m = (unsigned long)first_page->mapping; *fullness = m & FULLNESS_MASK; *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; } -static void set_zspage_mapping(struct page *page, unsigned int class_idx, +static void set_zspage_mapping(struct page *first_page, + unsigned int class_idx, enum fullness_group fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; + first_page->mapping = (struct address_space *)m; } /* @@ -567,17 +573,17 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(const char *name, struct zs_pool *pool) +static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { struct dentry *entry; if (!zs_stat_root) - return -ENODEV; + return; entry = debugfs_create_dir(name, zs_stat_root); if (!entry) { pr_warn("debugfs dir <%s> creation failed\n", name); - return -ENOMEM; + return; } pool->stat_dentry = entry; @@ -586,10 +592,8 @@ static int zs_pool_stat_create(const char *name, struct zs_pool *pool) if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); - return -ENOMEM; + return; } - - return 0; } static void zs_pool_stat_destroy(struct zs_pool *pool) @@ -607,9 +611,8 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(const char 
*name, struct zs_pool *pool) +static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) { - return 0; } static inline void zs_pool_stat_destroy(struct zs_pool *pool) @@ -617,7 +620,6 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) } #endif - /* * For each size class, zspages are divided into different groups * depending on how "full" they are. This was done so that we could @@ -625,14 +627,15 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *page) +static enum fullness_group get_fullness_group(struct page *first_page) { int inuse, max_objects; enum fullness_group fg; - BUG_ON(!is_first_page(page)); - inuse = page->inuse; - max_objects = page->objects; + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + + inuse = first_page->inuse; + max_objects = first_page->objects; if (inuse == 0) fg = ZS_EMPTY; @@ -652,12 +655,13 @@ static enum fullness_group get_fullness_group(struct page *page) * have. This functions inserts the given zspage into the freelist * identified by <class, fullness_group>. */ -static void insert_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void insert_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -667,7 +671,7 @@ static void insert_zspage(struct page *page, struct size_class *class, head = &class->fullness_list[fullness]; if (!*head) { - *head = page; + *head = first_page; return; } @@ -675,34 +679,35 @@ static void insert_zspage(struct page *page, struct size_class *class, * We want to see more ZS_FULL pages and less almost * empty/full. Put pages with higher ->inuse first. 
*/ - list_add_tail(&page->lru, &(*head)->lru); - if (page->inuse >= (*head)->inuse) - *head = page; + list_add_tail(&first_page->lru, &(*head)->lru); + if (first_page->inuse >= (*head)->inuse) + *head = first_page; } /* * This function removes the given zspage from the freelist identified * by <class, fullness_group>. */ -static void remove_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void remove_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; head = &class->fullness_list[fullness]; - BUG_ON(!*head); + VM_BUG_ON_PAGE(!*head, first_page); if (list_empty(&(*head)->lru)) *head = NULL; - else if (*head == page) + else if (*head == first_page) *head = (struct page *)list_entry((*head)->lru.next, struct page, lru); - list_del_init(&page->lru); + list_del_init(&first_page->lru); zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } @@ -717,21 +722,19 @@ static void remove_zspage(struct page *page, struct size_class *class, * fullness group. 
*/ static enum fullness_group fix_fullness_group(struct size_class *class, - struct page *page) + struct page *first_page) { int class_idx; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(page)); - - get_zspage_mapping(page, &class_idx, &currfg); - newfg = get_fullness_group(page); + get_zspage_mapping(first_page, &class_idx, &currfg); + newfg = get_fullness_group(first_page); if (newfg == currfg) goto out; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); + remove_zspage(class, currfg, first_page); + insert_zspage(class, newfg, first_page); + set_zspage_mapping(first_page, class_idx, newfg); out: return newfg; @@ -809,7 +812,7 @@ static void *location_to_obj(struct page *page, unsigned long obj_idx) unsigned long obj; if (!page) { - BUG_ON(obj_idx); + VM_BUG_ON(obj_idx); return NULL; } @@ -842,7 +845,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, void *obj) { if (class->huge) { - VM_BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(page), page); return page_private(page); } else return *(unsigned long *)obj; @@ -892,8 +895,8 @@ static void free_zspage(struct page *first_page) { struct page *nextp, *tmp, *head_extra; - BUG_ON(!is_first_page(first_page)); - BUG_ON(first_page->inuse); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + VM_BUG_ON_PAGE(first_page->inuse, first_page); head_extra = (struct page *)page_private(first_page); @@ -914,12 +917,13 @@ static void free_zspage(struct page *first_page) } /* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) +static void init_zspage(struct size_class *class, struct page *first_page) { unsigned long off = 0; struct page *page = first_page; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + while (page) { struct page *next_page; struct link_free *link; @@ -1001,7 +1005,7 @@ 
static struct page *alloc_zspage(struct size_class *class, gfp_t flags) prev_page = page; } - init_zspage(first_page, class); + init_zspage(class, first_page); first_page->freelist = location_to_obj(first_page, 0); /* Maximum number of objects we can store in this zspage */ @@ -1234,11 +1238,11 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } -static bool zspage_full(struct page *page) +static bool zspage_full(struct page *first_page) { - BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return page->inuse == page->objects; + return first_page->inuse == first_page->objects; } unsigned long zs_get_total_pages(struct zs_pool *pool) @@ -1274,14 +1278,12 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, struct page *pages[2]; void *ret; - BUG_ON(!handle); - /* * Because we use per-cpu mapping areas shared among the * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - BUG_ON(in_interrupt()); + WARN_ON_ONCE(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); @@ -1325,8 +1327,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) struct size_class *class; struct mapping_area *area; - BUG_ON(!handle); - obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); @@ -1350,8 +1350,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); -static unsigned long obj_malloc(struct page *first_page, - struct size_class *class, unsigned long handle) +static unsigned long obj_malloc(struct size_class *class, + struct page *first_page, unsigned long handle) { unsigned long obj; struct link_free *link; @@ -1391,7 +1391,7 @@ static unsigned long obj_malloc(struct page *first_page, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
*/ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1400,7 +1400,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,7 +1413,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1428,7 +1428,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) class->size, class->pages_per_zspage)); } - obj = obj_malloc(first_page, class, handle); + obj = obj_malloc(class, first_page, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, first_page); record_obj(handle, obj); @@ -1438,16 +1438,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct zs_pool *pool, struct size_class *class, - unsigned long obj) +static void obj_free(struct size_class *class, unsigned long obj) { struct link_free *link; struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - BUG_ON(!obj); - obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1487,7 +1484,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) class = pool->size_class[class_idx]; spin_lock(&class->lock); - obj_free(pool, class, obj); + obj_free(class, obj); fullness = fix_fullness_group(class, first_page); if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( @@ -1503,8 +1500,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); -static 
void zs_object_copy(unsigned long dst, unsigned long src, - struct size_class *class) +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) { struct page *s_page, *d_page; unsigned long s_objidx, d_objidx; @@ -1547,7 +1544,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); - BUG_ON(!s_page); s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); s_size = class->size - written; @@ -1557,7 +1553,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); - BUG_ON(!d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; @@ -1572,8 +1567,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src, * Find alloced object in zspage from index object and * return handle. */ -static unsigned long find_alloced_obj(struct page *page, int index, - struct size_class *class) +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int index) { unsigned long head; int offset = 0; @@ -1623,7 +1618,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, int ret = 0; while (1) { - handle = find_alloced_obj(s_page, index, class); + handle = find_alloced_obj(class, s_page, index); if (!handle) { s_page = get_next_page(s_page); if (!s_page) @@ -1640,8 +1635,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(d_page, class, handle); - zs_object_copy(free_obj, used_obj, class); + free_obj = obj_malloc(class, d_page, handle); + zs_object_copy(class, free_obj, used_obj); index++; /* * record_obj updates handle's value to free_obj and it will @@ -1652,7 +1647,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, 
free_obj); unpin_tag(handle); - obj_free(pool, class, used_obj); + obj_free(class, used_obj); } /* Remember last position in this iteration */ @@ -1670,7 +1665,7 @@ static struct page *isolate_target_page(struct size_class *class) for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { page = class->fullness_list[i]; if (page) { - remove_zspage(page, class, i); + remove_zspage(class, i, page); break; } } @@ -1692,10 +1687,8 @@ static enum fullness_group putback_zspage(struct zs_pool *pool, { enum fullness_group fullness; - BUG_ON(!is_first_page(first_page)); - fullness = get_fullness_group(first_page); - insert_zspage(first_page, class, fullness); + insert_zspage(class, fullness, first_page); set_zspage_mapping(first_page, class->index, fullness); if (fullness == ZS_EMPTY) { @@ -1720,7 +1713,7 @@ static struct page *isolate_source_page(struct size_class *class) if (!page) continue; - remove_zspage(page, class, i); + remove_zspage(class, i, page); break; } @@ -1735,10 +1728,13 @@ static struct page *isolate_source_page(struct size_class *class) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); - obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - - zs_stat_get(class, OBJ_USED); + if (obj_allocated <= obj_used) + return 0; + obj_wasted = obj_allocated - obj_used; obj_wasted /= get_maxobj_per_zspage(class->size, class->pages_per_zspage); @@ -1754,8 +1750,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { - BUG_ON(!is_first_page(src_page)); - if (!zs_can_compact(class)) break; @@ -1884,7 +1878,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. 
*/ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1954,10 +1948,8 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - - if (zs_pool_stat_create(name, pool)) - goto err; + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); /* * Not critical, we still can use the pool diff --git a/mm/zswap.c b/mm/zswap.c index 91dad80d068b..275b22cc8df4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -117,7 +117,7 @@ struct zswap_pool { struct crypto_comp * __percpu *tfm; struct kref kref; struct list_head list; - struct rcu_head rcu_head; + struct work_struct work; struct notifier_block notifier; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -170,6 +170,8 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); +/* pool counter to provide unique names to zpool */ +static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; @@ -565,6 +567,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -573,7 +576,10 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return NULL; } - pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + + pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); if (!pool->zpool) { pr_err("%s zpool not available\n", type); goto 
error; @@ -652,9 +658,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) return kref_get_unless_zero(&pool->kref); } -static void __zswap_pool_release(struct rcu_head *head) +static void __zswap_pool_release(struct work_struct *work) { - struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head); + struct zswap_pool *pool = container_of(work, typeof(*pool), work); + + synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); @@ -674,7 +682,9 @@ static void __zswap_pool_empty(struct kref *kref) WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); - call_rcu(&pool->rcu_head, __zswap_pool_release); + + INIT_WORK(&pool->work, __zswap_pool_release); + schedule_work(&pool->work); spin_unlock(&zswap_pools_lock); } |