Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.c               |   2
-rw-r--r-- | mm/compaction.c        | 156
-rw-r--r-- | mm/debug.c             |   3
-rw-r--r-- | mm/gup.c               | 228
-rw-r--r-- | mm/huge_memory.c       | 106
-rw-r--r-- | mm/hugetlb.c           | 158
-rw-r--r-- | mm/hugetlb_cgroup.c    |   2
-rw-r--r-- | mm/internal.h          |  22
-rw-r--r-- | mm/memcontrol.c        | 702
-rw-r--r-- | mm/memory.c            |  15
-rw-r--r-- | mm/mempolicy.c         | 277
-rw-r--r-- | mm/migrate.c           |   5
-rw-r--r-- | mm/mincore.c           | 166
-rw-r--r-- | mm/mmap.c              |   7
-rw-r--r-- | mm/mmzone.c            |   4
-rw-r--r-- | mm/nommu.c             |  37
-rw-r--r-- | mm/oom_kill.c          | 169
-rw-r--r-- | mm/page-writeback.c    |  17
-rw-r--r-- | mm/page_alloc.c        | 432
-rw-r--r-- | mm/page_counter.c      |   7
-rw-r--r-- | mm/page_owner.c        |  26
-rw-r--r-- | mm/pagewalk.c          | 238
-rw-r--r-- | mm/process_vm_access.c |   7
-rw-r--r-- | mm/rmap.c              |  12
-rw-r--r-- | mm/shmem.c             |   2
-rw-r--r-- | mm/util.c              |  10
-rw-r--r-- | mm/vmscan.c            |  32
-rw-r--r-- | mm/vmstat.c            |   6
28 files changed, 1738 insertions, 1110 deletions
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, cma->order_per_bit = order_per_bit; *res_cma = cma; cma_area_count++; + totalcma_pages += (size / PAGE_SIZE); return 0; } @@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base, if (ret) goto err; - totalcma_pages += (size / PAGE_SIZE); pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..b68736c8a1ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -34,6 +34,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA +#ifdef CONFIG_TRACEPOINTS +static const char *const compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", + "no_suitable_page", + "not_suitable_zone", +}; +#endif #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> @@ -113,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, } #ifdef CONFIG_COMPACTION + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 1 << compact_defer_limit compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +void defer_compaction(struct zone *zone, int order) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; + + trace_mm_compaction_defer_compaction(zone, order); +} + +/* Returns true if compaction should be skipped this time */ +bool compaction_deferred(struct zone *zone, int order) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + if (order < zone->compact_order_failed) + return false; + + /* Avoid possible overflow */ + if (++zone->compact_considered > defer_limit) + zone->compact_considered = defer_limit; + + if (zone->compact_considered >= defer_limit) + return false; + + trace_mm_compaction_deferred(zone, order); + + return true; +} + +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; + + trace_mm_compaction_defer_reset(zone, order); +} + +/* Returns true if restarting compaction after many failures */ +bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + /* Returns true if the pageblock should be scanned for pages to isolate. 
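Taken together, the deferral helpers above implement a capped exponential backoff: every failed compaction widens the window of skipped attempts, up to 1 << COMPACT_MAX_DEFER_SHIFT. The following is a minimal standalone sketch of just that arithmetic; struct zone_model and the driver loop are stand-ins invented for illustration (not kernel code), and the per-order checks and the reset-on-success path are omitted:

    #include <stdbool.h>
    #include <stdio.h>

    #define COMPACT_MAX_DEFER_SHIFT 6   /* never skip more than 63 in a row */

    /* Illustrative stand-in for the deferral fields of struct zone. */
    struct zone_model {
            unsigned int compact_considered;
            unsigned int compact_defer_shift;
    };

    /* Mirrors defer_compaction(): widen the skip window, capped. */
    static void defer(struct zone_model *z)
    {
            z->compact_considered = 0;
            if (++z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                    z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
    }

    /* Mirrors compaction_deferred(): skip until the window is used up. */
    static bool deferred(struct zone_model *z)
    {
            unsigned long limit = 1UL << z->compact_defer_shift;

            if (++z->compact_considered > limit)
                    z->compact_considered = limit;
            return z->compact_considered < limit;
    }

    int main(void)
    {
            struct zone_model z = { 0, 0 };
            int fail;

            /* Prints 1, 3, 7: each failure roughly doubles the skips. */
            for (fail = 1; fail <= 3; fail++) {
                    int skipped = 0;

                    defer(&z);
                    while (deferred(&z))
                            skipped++;
                    printf("after failure %d: skipped %d attempts\n",
                           fail, skipped);
            }
            return 0;
    }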
*/ static inline bool isolation_suitable(struct compact_control *cc, struct page *page) @@ -421,11 +503,12 @@ isolate_fail: } + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, + nr_scanned, total_isolated); + /* Record how far we have got within the block */ *start_pfn = blockpfn; - trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); - /* * If strict isolation is requested by CMA then check that all the * pages requested were isolated. If there were any failures, 0 is @@ -581,6 +664,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; + unsigned long start_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -741,7 +825,8 @@ isolate_success: if (low_pfn == end_pfn) update_pageblock_skip(cc, valid_page, nr_isolated, true); - trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, + nr_scanned, nr_isolated); count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); if (nr_isolated) @@ -1037,7 +1122,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, +static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1092,7 +1177,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; } - return COMPACT_CONTINUE; + return COMPACT_NO_SUITABLE_PAGE; +} + +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + int ret; + + ret = __compact_finished(zone, cc, migratetype); + trace_mm_compaction_finished(zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; } /* @@ -1102,7 +1200,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order, +static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { int fragindex; @@ -1146,11 +1244,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, */ fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_SKIPPED; + return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; } +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + trace_mm_compaction_suitable(zone, order, ret); + if (ret == COMPACT_NOT_SUITABLE_ZONE) + ret = COMPACT_SKIPPED; + + return ret; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1197,7 +1308,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync); migrate_prep_local(); @@ -1299,7 +1411,8 @@ out: zone->compact_cached_free_pfn = free_pfn; } - 
trace_mm_compaction_end(ret); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync, ret); return ret; } @@ -1335,22 +1448,20 @@ int sysctl_extfrag_threshold = 500; /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation - * @zonelist: The zonelist used for the current allocation - * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation - * @nodemask: The allowed nodes to allocate from + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention * * This is the main entry point for direct page compaction. */ -unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1364,9 +1475,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { int status; int zone_contended; @@ -1374,7 +1487,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, classzone_idx); + &zone_contended, alloc_flags, + ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1384,7 +1498,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. 
The caller diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - return NULL; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - return page; + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. 
- */ - if (PageHead(page)) - get_page(page); - else - page = NULL; - } - return page; + page = follow_huge_pmd(mm, address, pmd, flags); + if (page) + return page; + return no_page_table(vma, flags); } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) return no_page_table(vma, flags); @@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return 0; } +static __always_inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, bool notify_drop, + unsigned int flags) +{ + long ret, pages_done; + bool lock_dropped; + + if (locked) { + /* if VM_FAULT_RETRY can be returned, vmas become invalid */ + BUG_ON(vmas); + /* check caller initialized locked */ + BUG_ON(*locked != 1); + } + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + pages_done = 0; + lock_dropped = false; + for (;;) { + ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + vmas, locked); + if (!locked) + /* VM_FAULT_RETRY couldn't trigger, bypass */ + return ret; + + /* VM_FAULT_RETRY cannot return errors */ + if (!*locked) { + BUG_ON(ret < 0); + BUG_ON(ret >= nr_pages); + } + + if (!pages) + /* If it's a prefault don't insist harder */ + return ret; + + if (ret > 0) { + nr_pages -= ret; + pages_done += ret; + if (!nr_pages) + break; + } + if (*locked) { + /* VM_FAULT_RETRY didn't trigger */ + if (!pages_done) + pages_done = ret; + break; + } + /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ + pages += ret; + start += ret << PAGE_SHIFT; + + /* + * Repeat on the address that fired VM_FAULT_RETRY + * without FAULT_FLAG_ALLOW_RETRY but with + * FAULT_FLAG_TRIED. + */ + *locked = 1; + lock_dropped = true; + down_read(&mm->mmap_sem); + ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + pages, NULL, NULL); + if (ret != 1) { + BUG_ON(ret > 1); + if (!pages_done) + pages_done = ret; + break; + } + nr_pages--; + pages_done++; + if (!nr_pages) + break; + pages++; + start += PAGE_SIZE; + } + if (notify_drop && lock_dropped && *locked) { + /* + * We must let the caller know we temporarily dropped the lock + * and so the critical section protected by it was lost. + */ + up_read(&mm->mmap_sem); + *locked = 0; + } + return pages_done; +} + +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * + * get_user_pages_locked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * to: + * + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); + */ +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, true, FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages_locked); + +/* + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to + * pass additional gup_flags as last parameter (like FOLL_HWPOISON). 
+ * + * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the + * caller if required (just like with __get_user_pages). "FOLL_GET", + * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed + * according to the parameters "pages", "write", "force" + * respectively. + */ +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false, gup_flags); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead, if the two parameters + * "tsk" and "mm" are respectively equal to current and current->mm, + * or if "force" shall be set to 1 (get_user_pages_fast misses the + * "force" parameter). + */ +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
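The comments above spell out the intended call-site conversions. Below is a hedged sketch of what they look like in practice; pin_user_buffer_old(), pin_user_buffer() and pin_user_buffer_locked() are hypothetical wrappers invented for illustration, while the gup calls follow the signatures added in this hunk:

    /* Before: VM_FAULT_RETRY can never trigger under the held semaphore. */
    static long pin_user_buffer_old(struct task_struct *tsk, struct mm_struct *mm,
                                    unsigned long start, unsigned long nr_pages,
                                    struct page **pages)
    {
            long ret;

            down_read(&mm->mmap_sem);
            ret = get_user_pages(tsk, mm, start, nr_pages, 1, 0, pages, NULL);
            up_read(&mm->mmap_sem);
            return ret;
    }

    /* After: the helper may drop mmap_sem while faults wait for I/O. */
    static long pin_user_buffer(struct task_struct *tsk, struct mm_struct *mm,
                                unsigned long start, unsigned long nr_pages,
                                struct page **pages)
    {
            return get_user_pages_unlocked(tsk, mm, start, nr_pages, 1, 0, pages);
    }

    /* Locked form, for callers with more work to do under mmap_sem. */
    static long pin_user_buffer_locked(struct task_struct *tsk, struct mm_struct *mm,
                                       unsigned long start, unsigned long nr_pages,
                                       struct page **pages)
    {
            int locked = 1;
            long ret;

            down_read(&mm->mmap_sem);
            /* ... other work that needs mmap_sem held ... */
            ret = get_user_pages_locked(tsk, mm, start, nr_pages, 1, 0,
                                        pages, &locked);
            if (locked)
                    up_read(&mm->mmap_sem);
            return ret;
    }

As the comment above notes, when tsk and mm are simply current and current->mm, get_user_pages_fast() remains the preferred entry point.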
*/ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, vmas, NULL, false, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - nr_pages - nr, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..cb7be110cad3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -171,12 +171,7 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static struct page *huge_zero_page __read_mostly; - -static inline bool is_huge_zero_page(struct page *page) -{ - return ACCESS_ONCE(huge_zero_page) == page; -} +struct page *huge_zero_page __read_mostly; static inline bool is_huge_zero_pmd(pmd_t pmd) { @@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - /* Caller must hold page table lock. 
*/ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { + gfp_t gfp; struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return 0; } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - else + !transparent_hugepage_debug_cow()) { + gfp_t gfp; + + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + } else new_page = NULL; if (unlikely(!new_page)) { @@ -1423,26 +1412,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } -int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned char *vec) -{ - spinlock_t *ptl; - int ret = 0; - - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - /* - * All logical pages in the range are present - * if backed by a huge page. - */ - spin_unlock(ptl); - memset(vec, 1, (end - addr) >> PAGE_SHIFT); - ret = 1; - } - - return ret; -} - int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, @@ -2148,7 +2117,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int referenced = 0, none = 0; + int none = 0; + bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; @@ -2158,7 +2128,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, else goto out; } - if (!pte_present(pteval) || !pte_write(pteval)) + if (!pte_present(pteval)) goto out; page = vm_normal_page(vma, address, pteval); if (unlikely(!page)) @@ -2168,9 +2138,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) - goto out; /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock @@ -2179,6 +2146,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (!trylock_page(page)) goto out; + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + unlock_page(page); + goto out; + } + if (pte_write(pteval)) { + writable = true; + } else { + if (PageSwapCache(page) && !reuse_swap_page(page)) { + unlock_page(page); + goto out; + } + /* + * Page is not in the swap cache. 
It can be collapsed + * into a THP. + */ + } + /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. @@ -2195,9 +2185,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, /* If there is no mapped pte young don't collapse the page */ if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) - referenced = 1; + referenced = true; } - if (likely(referenced)) + if (likely(referenced && writable)) return 1; out: release_pte_pages(pte, _pte); @@ -2550,11 +2540,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, referenced = 0, none = 0; + int ret = 0, none = 0; struct page *page; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE; + bool writable = false, referenced = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -2573,8 +2564,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, else goto out_unmap; } - if (!pte_present(pteval) || !pte_write(pteval)) + if (!pte_present(pteval)) goto out_unmap; + if (pte_write(pteval)) + writable = true; + page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; @@ -2591,14 +2585,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; - /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) - referenced = 1; + referenced = true; } - if (referenced) + if (referenced && writable) ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be0e5d0db5ec..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2657,9 +2657,10 @@ again: goto unlock; /* - * HWPoisoned hugepage is already unmapped and dropped reference + * Migrating hugepage or HWPoisoned hugepage is already + * unmapped and its refcount is dropped, so just clear pte here. */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + if (unlikely(!pte_present(pte))) { huge_pte_clear(mm, address, ptep); goto unlock; } @@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *pagecache_page = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; + int need_wait_lock = 0; address &= huge_page_mask(h); @@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; /* + * entry could be a migration/hwpoison entry at this point, so this + * check prevents the kernel from going below assuming that we have + * a active hugepage in pagecache. This goto expects the 2nd page fault, + * and is_hugetlb_entry_(migration|hwpoisoned) check will properly + * handle it. + */ + if (!pte_present(entry)) + goto out_mutex; + + /* * If we are going to COW the mapping later, we examine the pending * reservations for this page now. 
This will ensure that any * allocations necessary to record that reservation occur outside the @@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + ptl = huge_pte_lock(h, mm, ptep); + + /* Check for a racing update before calling hugetlb_cow */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_ptl; + /* * hugetlb_cow() requires page locks of pte_page(entry) and * pagecache_page, so here we need take the former one * when page != pagecache_page or !pagecache_page. - * Note that locking order is always pagecache_page -> page, - * so no worry about deadlock. */ page = pte_page(entry); - get_page(page); if (page != pagecache_page) - lock_page(page); - - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); - /* Check for a racing update before calling hugetlb_cow */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_ptl; + if (!trylock_page(page)) { + need_wait_lock = 1; + goto out_ptl; + } + get_page(page); if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page, ptl); - goto out_ptl; + goto out_put_page; } entry = huge_pte_mkdirty(entry); } @@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); - +out_put_page: + if (page != pagecache_page) + unlock_page(page); + put_page(page); out_ptl: spin_unlock(ptl); @@ -3229,12 +3245,17 @@ out_ptl: unlock_page(pagecache_page); put_page(pagecache_page); } - if (page != pagecache_page) - unlock_page(page); - put_page(page); - out_mutex: mutex_unlock(&htlb_fault_mutex_table[hash]); + /* + * Generally it's safe to hold refcount during waiting page lock. But + * here we just wait to defer the next page fault to avoid busy loop and + * the page is not used after unlocked before returning from the current + * page fault. So we are safe from accessing freed page, even if we wait + * here without taking refcount. 
+ */ + if (need_wait_lock) + wait_on_page_locked(page); return ret; } @@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, spin_unlock(ptl); continue; } - if (!huge_pte_none(huge_ptep_get(ptep))) { + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + spin_unlock(ptl); + continue; + } + if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (is_write_migration_entry(entry)) { + pte_t newpte; + + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_huge_pte_at(mm, address, ptep, newpte); + pages++; + } + spin_unlock(ptl); + continue; + } + if (!huge_pte_none(pte)) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(huge_pte_modify(pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); @@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } @@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); - return page; +/* + * These functions are overwritable if your architecture needs its own + * behavior. + */ +struct page * __weak +follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write) +{ + return ERR_PTR(-EINVAL); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) +struct page * __weak +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int flags) { - struct page *page; - - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); + struct page *page = NULL; + spinlock_t *ptl; +retry: + ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + /* + * make sure that the address range covered by this pmd is not + * unmapped from other threads. + */ + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { + page = pte_page(*(pte_t *)pmd) + + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pmd, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). 
+ */ + } +out: + spin_unlock(ptl); return page; } -#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ - -/* Can be overriden by architectures */ struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) + pud_t *pud, int flags) { - BUG(); - return NULL; -} + if (flags & FOLL_GET) + return NULL; -#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +} #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); */ /* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + +/* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). 
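The new comment documents which alloc_context fields stay fixed for the allocation's lifetime and which ones the slowpath may rewrite. A hedged sketch of the initialization pattern it implies follows; the function below is an assumption for illustration, since the matching __alloc_pages_nodemask() changes are not part of this hunk:

    static struct page *alloc_pages_sketch(gfp_t gfp_mask, unsigned int order,
                                           struct zonelist *zonelist,
                                           nodemask_t *nodemask)
    {
            struct alloc_context ac = {
                    /* Initialized once, never changed afterwards. */
                    .high_zoneidx = gfp_zone(gfp_mask),
                    .nodemask = nodemask,
                    .migratetype = gfpflags_to_migratetype(gfp_mask),
                    /* Set for the fastpath; the slowpath may rewrite it. */
                    .zonelist = zonelist,
            };

            /*
             * preferred_zone and classzone_idx would be derived from the
             * first suitable zone before the fastpath attempt; helpers such
             * as try_to_compact_pages() then receive the whole structure
             * through a const pointer.
             */
            return NULL; /* fast and slow paths elided */
    }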
* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3f8a4f52a0c..095c1f96fbec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; +/* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; - -/* for remember boot option*/ -#ifdef CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - #else #define do_swap_account 0 #endif - static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, -}; - static const char * const mem_cgroup_events_names[] = { "pgpgin", "pgpgout", @@ -138,7 +121,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -284,6 +267,10 @@ struct mem_cgroup { struct page_counter memsw; struct page_counter kmem; + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + unsigned long soft_limit; /* vmpressure notifications */ @@ -325,9 +312,11 @@ struct mem_cgroup { /* * set > 0 if pages under this cgroup are moving to other cgroup. */ - atomic_t moving_account; + atomic_t moving_account; /* taken only while moving_account > 0 */ - spinlock_t move_lock; + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; /* * percpu counter. */ @@ -371,21 +360,18 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) /* Stuffs for move charges at task migration. */ /* - * Types of charges to be moved. "move_charge_at_immitgrate" and - * "immigrate_flags" are treated as a left-shifted bitmap of these types. + * Types of charges to be moved. */ -enum move_type { - MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ - MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ - NR_MOVE_TYPE, -}; +#define MOVE_ANON 0x1U +#define MOVE_FILE 0x2U +#define MOVE_MASK (MOVE_ANON | MOVE_FILE) /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; - unsigned long immigrate_flags; + unsigned long flags; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; @@ -396,16 +382,6 @@ static struct move_charge_struct { .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; -static bool move_anon(void) -{ - return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); -} - -static bool move_file(void) -{ - return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); -} - /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. 
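With the enum gone, the value written to memory.move_charge_at_immigrate is interpreted directly as a bitmask and rejected when it carries unknown bits. A small standalone model of that validation, with the masks copied from this hunk and everything else an illustrative stand-in:

    #include <errno.h>
    #include <stdio.h>

    #define MOVE_ANON 0x1U
    #define MOVE_FILE 0x2U
    #define MOVE_MASK (MOVE_ANON | MOVE_FILE)

    /* Mirrors mem_cgroup_move_charge_write(): reject unknown bits. */
    static int set_move_flags(unsigned long val, unsigned long *flags)
    {
            if (val & ~MOVE_MASK)
                    return -EINVAL;
            *flags = val;
            return 0;
    }

    int main(void)
    {
            unsigned long flags = 0;

            printf("3 -> %d\n", set_move_flags(3, &flags)); /* anon|file: 0 */
            printf("4 -> %d\n", set_move_flags(4, &flags)); /* -EINVAL */
            return 0;
    }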
@@ -1365,6 +1341,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1557,7 +1547,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } @@ -1931,7 +1921,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; @@ -1977,34 +1967,33 @@ cleanup: /** * mem_cgroup_begin_page_stat - begin a page state statistics transaction * @page: page that is going to change accounted state - * @locked: &memcg->move_lock slowpath was taken - * @flags: IRQ-state flags for &memcg->move_lock * * This function must mark the beginning of an accounted page state * change to prevent double accounting when the page is concurrently * being moved to another memcg: * - * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * memcg = mem_cgroup_begin_page_stat(page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg, locked, flags); - * - * The RCU lock is held throughout the transaction. The fast path can - * get away without acquiring the memcg->move_lock (@locked is false) - * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when the page - * state that is going to change is the only thing preventing the page - * from being uncharged. E.g. end-writeback clearing PageWriteback(), - * which allows migration to go ahead and uncharge the page before the - * account transaction might be complete. + * mem_cgroup_end_page_stat(memcg); */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, - unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { struct mem_cgroup *memcg; + unsigned long flags; + /* + * The RCU lock is held throughout the transaction. The fast + * path can get away without acquiring the memcg->move_lock + * because page moving starts with an RCU grace period. + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page from being uncharged. + * E.g. end-writeback clearing PageWriteback(), which allows + * migration to go ahead and uncharge the page before the + * account transaction might be complete. 
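A hedged sketch of a caller under the simplified transaction API, following the pattern quoted in the kernel-doc above; TestClearPageState and the state index are the comment's placeholders rather than real symbols:

    static void account_page_state_clear(struct page *page, int state)
    {
            struct mem_cgroup *memcg;

            /*
             * Pins the memcg via RCU; takes memcg->move_lock only if a
             * charge move is in flight, remembering the owner in
             * move_lock_task so no state leaks back to the caller.
             */
            memcg = mem_cgroup_begin_page_stat(page);

            if (TestClearPageState(page))
                    mem_cgroup_update_page_stat(memcg, state, -1);

            /* Unlocks only if this task took move_lock above. */
            mem_cgroup_end_page_stat(memcg);
    }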
+ */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2014,16 +2003,22 @@ again: if (unlikely(!memcg)) return NULL; - *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - spin_lock_irqsave(&memcg->move_lock, *flags); + spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { - spin_unlock_irqrestore(&memcg->move_lock, *flags); + spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } - *locked = true; + + /* + * When charge migration first begins, we can have locked and + * unlocked page stat updates happening concurrently. Track + * the task who has the lock for mem_cgroup_end_page_stat(). + */ + memcg->move_lock_task = current; + memcg->move_lock_flags = flags; return memcg; } @@ -2031,14 +2026,17 @@ again: /** * mem_cgroup_end_page_stat - finish a page state statistics transaction * @memcg: the memcg that was accounted against - * @locked: value received from mem_cgroup_begin_page_stat() - * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { - if (memcg && *locked) - spin_unlock_irqrestore(&memcg->move_lock, *flags); + if (memcg && memcg->move_lock_task == current) { + unsigned long flags = memcg->move_lock_flags; + + memcg->move_lock_task = NULL; + memcg->move_lock_flags = 0; + + spin_unlock_irqrestore(&memcg->move_lock, flags); + } rcu_read_unlock(); } @@ -2131,17 +2129,6 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } -static void __init memcg_stock_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct memcg_stock_pcp *stock = - &per_cpu(memcg_stock, cpu); - INIT_WORK(&stock->work, drain_local_stock); - } -} - /* * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. @@ -2291,6 +2278,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -2332,6 +2321,8 @@ retry: if (fatal_signal_pending(current)) goto bypass; + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2343,6 +2334,16 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. 
+ */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; } @@ -3390,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, int ret; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; @@ -3466,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val >= (1 << NR_MOVE_TYPE)) + if (val & ~MOVE_MASK) return -EINVAL; /* @@ -3544,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) struct mem_cgroup *mi; unsigned int i; + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != + MEM_CGROUP_STAT_NSTATS); + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != + MEM_CGROUP_EVENTS_NSTATS); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -3758,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, unsigned long usage; int i, size, ret; - ret = page_counter_memparse(args, &threshold); + ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; @@ -4248,7 +4253,7 @@ out_kfree: return ret; } -static struct cftype mem_cgroup_files[] = { +static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), @@ -4359,34 +4364,6 @@ static struct cftype mem_cgroup_files[] = { { }, /* terminate */ }; -#ifdef CONFIG_MEMCG_SWAP -static struct cftype memsw_cgroup_files[] = { - { - .name = "memsw.usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read_u64 = mem_cgroup_read_u64, - }, - { - .name = "memsw.max_usage_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .write = mem_cgroup_reset, - .read_u64 = mem_cgroup_read_u64, - }, - { - .name = "memsw.limit_in_bytes", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write = mem_cgroup_write, - .read_u64 = mem_cgroup_read_u64, - }, - { - .name = "memsw.failcnt", - .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .write = mem_cgroup_reset, - .read_u64 = mem_cgroup_read_u64, - }, - { }, /* terminate */ -}; -#endif static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4482,29 +4459,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -static void __init mem_cgroup_soft_limit_tree_init(void) -{ - struct mem_cgroup_tree_per_node *rtpn; - struct mem_cgroup_tree_per_zone *rtpz; - int tmp, node, zone; - - for_each_node(node) { - tmp = node; - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - BUG_ON(!rtpn); - - soft_limit_tree.rb_tree_per_node[node] = rtpn; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } - } -} - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -4524,6 +4478,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_css == NULL) { root_mem_cgroup = memcg; page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, 
NULL); page_counter_init(&memcg->kmem, NULL); @@ -4569,6 +4524,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent->use_hierarchy) { page_counter_init(&memcg->memory, &parent->memory); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); @@ -4579,6 +4535,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) */ } else { page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4654,6 +4611,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->low = 0; + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; } @@ -4730,12 +4689,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, if (!page || !page_mapped(page)) return NULL; if (PageAnon(page)) { - /* we don't move shared anon */ - if (!move_anon()) + if (!(mc.flags & MOVE_ANON)) return NULL; - } else if (!move_file()) - /* we ignore mapcount for file pages */ - return NULL; + } else { + if (!(mc.flags & MOVE_FILE)) + return NULL; + } if (!get_page_unless_zero(page)) return NULL; @@ -4749,7 +4708,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); - if (!move_anon() || non_swap_entry(ent)) + if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) return NULL; /* * Because lookup_swap_cache() updates some statistics counter, @@ -4778,7 +4737,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, if (!vma->vm_file) /* anonymous vma */ return NULL; - if (!move_file()) + if (!(mc.flags & MOVE_FILE)) return NULL; mapping = vma->vm_file->f_mapping; @@ -4857,7 +4816,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); - if (!move_anon()) + if (!(mc.flags & MOVE_ANON)) return ret; if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@ -4880,7 +4839,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; @@ -4906,20 +4865,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) { unsigned long precharge; - struct vm_area_struct *vma; + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, + .mm = mm, + }; down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct mm_walk mem_cgroup_count_precharge_walk = { - .pmd_entry = mem_cgroup_count_precharge_pte_range, - .mm = mm, - .private = vma, - }; - if (is_vm_hugetlb_page(vma)) - continue; - walk_page_range(vma->vm_start, vma->vm_end, - &mem_cgroup_count_precharge_walk); - } + walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -4999,15 +4951,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; struct mem_cgroup *memcg = 
mem_cgroup_from_css(css); - unsigned long move_charge_at_immigrate; + unsigned long move_flags; /* * We are now commited to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. */ - move_charge_at_immigrate = memcg->move_charge_at_immigrate; - if (move_charge_at_immigrate) { + move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); + if (move_flags) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -5027,7 +4979,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, spin_lock(&mc.lock); mc.from = from; mc.to = memcg; - mc.immigrate_flags = move_charge_at_immigrate; + mc.flags = move_flags; spin_unlock(&mc.lock); /* We set mc.moving_task later */ @@ -5052,7 +5004,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, struct mm_walk *walk) { int ret = 0; - struct vm_area_struct *vma = walk->private; + struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; enum mc_target_type target_type; @@ -5148,7 +5100,10 @@ put: /* get_mctgt_type() gets the page */ static void mem_cgroup_move_charge(struct mm_struct *mm) { - struct vm_area_struct *vma; + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + }; lru_add_drain_all(); /* @@ -5171,24 +5126,11 @@ retry: cond_resched(); goto retry; } - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int ret; - struct mm_walk mem_cgroup_move_charge_walk = { - .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, - .private = vma, - }; - if (is_vm_hugetlb_page(vma)) - continue; - ret = walk_page_range(vma->vm_start, vma->vm_end, - &mem_cgroup_move_charge_walk); - if (ret) - /* - * means we have consumed all precharges and failed in - * doing additional charge. Just abandon here. - */ - break; - } + /* + * When we have consumed all precharges and failed in doing + * additional charge, the page walk just aborts. 
+ */ + walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); up_read(&mm->mmap_sem); atomic_dec(&mc.from->moving_account); } @@ -5239,118 +5181,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys memory_cgrp_subsys = { - .css_alloc = mem_cgroup_css_alloc, - .css_online = mem_cgroup_css_online, - .css_offline = mem_cgroup_css_offline, - .css_free = mem_cgroup_css_free, - .css_reset = mem_cgroup_css_reset, - .can_attach = mem_cgroup_can_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, - .legacy_cftypes = mem_cgroup_files, - .early_init = 0, -}; +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} -#ifdef CONFIG_MEMCG_SWAP -static int __init enable_swap_account(char *s) +static int memory_low_show(struct seq_file *m, void *v) { - if (!strcmp(s, "1")) - really_do_swap_account = 1; - else if (!strcmp(s, "0")) - really_do_swap_account = 0; - return 1; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = ACCESS_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; } -__setup("swapaccount=", enable_swap_account); -static void __init memsw_file_init(void) +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; } -static void __init enable_swap_cgroup(void) +static int memory_high_show(struct seq_file *m, void *v) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - memsw_file_init(); - } + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = ACCESS_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; } -#else -static void __init enable_swap_cgroup(void) +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; } -#endif -#ifdef CONFIG_MEMCG_SWAP -/** - * mem_cgroup_swapout - transfer a memsw charge to swap - * @page: page whose memsw charge to transfer - * @entry: swap entry to move the charge to - * - * Transfer the memsw charge of @page to @entry. 
- */ -void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +static int memory_max_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg; - unsigned short oldid; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = ACCESS_ONCE(memcg->memory.limit); - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); - if (!do_swap_account) - return; + return 0; +} - memcg = page->mem_cgroup; +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; - /* Readahead page, never charged */ - if (!memcg) - return; + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &max); + if (err) + return err; - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); - VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; - page->mem_cgroup = NULL; + return nbytes; +} - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memory, 1); +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - /* XXX: caller holds IRQ-safe mapping->tree_lock */ - VM_BUG_ON(!irqs_disabled()); + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); - mem_cgroup_charge_statistics(memcg, page, -1); - memcg_check_events(memcg, page); + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + +struct cgroup_subsys memory_cgrp_subsys = { + .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, + .css_offline = mem_cgroup_css_offline, + .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, + .can_attach = mem_cgroup_can_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, + .early_init = 0, +}; + +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry - * @entry: swap entry to uncharge + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check * - * Drop the memsw charge associated with @entry. 
+ * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; - unsigned short id; + if (mem_cgroup_disabled()) + return false; - if (!do_swap_account) - return; + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. + */ - id = swap_cgroup_record(entry, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; } - rcu_read_unlock(); + return true; } -#endif /** * mem_cgroup_try_charge - try charging a page @@ -5684,10 +5719,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, */ static int __init mem_cgroup_init(void) { + int cpu, node; + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); - enable_swap_cgroup(); - mem_cgroup_soft_limit_tree_init(); - memcg_stock_init(); + + for_each_possible_cpu(cpu) + INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, + drain_local_stock); + + for_each_node(node) { + struct mem_cgroup_tree_per_node *rtpn; + int zone; + + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct mem_cgroup_tree_per_zone *rtpz; + + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + soft_limit_tree.rb_tree_per_node[node] = rtpn; + } + return 0; } subsys_initcall(mem_cgroup_init); + +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + page->mem_cgroup = NULL; + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, 1); + + /* XXX: caller holds IRQ-safe mapping->tree_lock */ + VM_BUG_ON(!irqs_disabled()); + + mem_cgroup_charge_statistics(memcg, page, -1); + memcg_check_events(memcg, page); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. 
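+ *

For reference, the cgroup v2 control files created by memory_files above are consumed from userspace as plain files under the unified hierarchy. A minimal sketch of how a workload manager might set memory.low and then poll memory.events; the /sys/fs/cgroup mount point and the group name "job" are assumptions for illustration, not part of the patch:

#include <stdio.h>

int main(void)
{
	const char *base = "/sys/fs/cgroup/job";	/* assumed group, not from the patch */
	char path[256], line[64];
	FILE *f;

	/* Keep roughly 512MiB protected before reclaim treats the group as fair game. */
	snprintf(path, sizeof(path), "%s/memory.low", base);
	f = fopen(path, "w");
	if (!f) { perror("memory.low"); return 1; }
	fprintf(f, "%llu\n", 512ULL << 20);
	fclose(f);

	/* memory.events: the "low" counter records reclaim that ignored the protection. */
	snprintf(path, sizeof(path), "%s/memory.events", base);
	f = fopen(path, "r");
	if (!f) { perror("memory.events"); return 1; }
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
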
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+	struct mem_cgroup *memcg;
+	unsigned short id;
+
+	if (!do_swap_account)
+		return;
+
+	id = swap_cgroup_record(entry, 0);
+	rcu_read_lock();
+	memcg = mem_cgroup_lookup(id);
+	if (memcg) {
+		if (!mem_cgroup_is_root(memcg))
+			page_counter_uncharge(&memcg->memsw, 1);
+		mem_cgroup_swap_statistics(memcg, false);
+		css_put(&memcg->css);
+	}
+	rcu_read_unlock();
+}
+
+/* remember the boot option */
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+	if (!strcmp(s, "1"))
+		really_do_swap_account = 1;
+	else if (!strcmp(s, "0"))
+		really_do_swap_account = 0;
+	return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+	{
+		.name = "memsw.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "memsw.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+		.write = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "memsw.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+		.write = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "memsw.failcnt",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+		.write = mem_cgroup_reset,
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{ },	/* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+	if (!mem_cgroup_disabled() && really_do_swap_account) {
+		do_swap_account = 1;
+		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+						  memsw_cgroup_files));
+	}
+	return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
 	pmd_free_tlb(tlb, pmd, start);
+	mm_dec_nr_pmds(tlb->mm);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud))		/* Another has populated it */
-		pmd_free(mm, new);
-	else
+	if (!pud_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pud_populate(mm, pud, new);
-#else
-	if (pgd_present(*pud))		/* Another has populated it */
+	} else	/* Another has populated it */
 		pmd_free(mm, new);
-	else
+#else
+	if (!pgd_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pgd_populate(mm, pud, new);
+	} else	/* Another has populated it */
+		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +481,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
+struct queue_pages {
+	struct list_head *pagelist;
+	unsigned long flags;
+	nodemask_t *nmask;
+	struct vm_area_struct *prev;
+};
+
 /*
  * Scan through pages checking if pages follow certain conditions,
  * and move them to the pagelist if they do.
*/ -static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) { - pte_t *orig_pte; + struct vm_area_struct *vma = walk->vma; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; pte_t *pte; spinlock_t *ptl; - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { - struct page *page; - int nid; + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); @@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (PageReserved(page)) continue; nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, private, flags); - else - break; - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(orig_pte, ptl); - return addr != end; + migrate_page_add(page, qp->pagelist, flags); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; } -static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, - pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, - void *private) +static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; int nid; struct page *page; spinlock_t *ptl; pte_t entry; - ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); - entry = huge_ptep_get((pte_t *)pmd); + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; page = pte_page(entry); nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) goto unlock; /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
	 */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, private);
+		isolate_huge_page(page, qp->pagelist);
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-				unsigned long addr, unsigned long end,
-				const nodemask_t *nodes, unsigned long flags,
-				void *private)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (!pmd_present(*pmd))
-			continue;
-		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
-			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
-						flags, private);
-			continue;
-		}
-		split_huge_page_pmd(vma, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			continue;
-		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
-					flags, private))
-			return -EIO;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-				unsigned long addr, unsigned long end,
-				const nodemask_t *nodes, unsigned long flags,
-				void *private)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
-			continue;
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
-					flags, private))
-			return -EIO;
-	} while (pud++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end,
-				const nodemask_t *nodes, unsigned long flags,
-				void *private)
-{
-	pgd_t *pgd;
-	unsigned long next;
-
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
-					flags, private))
-			return -EIO;
-	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
@@ -641,6 +583,46 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct queue_pages *qp = walk->private;
+	unsigned long endvma = vma->vm_end;
+	unsigned long flags = qp->flags;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (endvma > end)
+		endvma = end;
+	if (vma->vm_start > start)
+		start = vma->vm_start;
+
+	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+		if (!vma->vm_next && vma->vm_end < end)
+			return -EFAULT;
+		if (qp->prev && qp->prev->vm_end < vma->vm_start)
+			return -EFAULT;
+	}
+
+	qp->prev = vma;
+
+	if (flags & MPOL_MF_LAZY) {
+		/* Similar to task_numa_work, skip inaccessible VMAs */
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+			change_prot_numa(vma, start, endvma);
+		return 1;
+	}
+
+	if ((flags & MPOL_MF_STRICT) ||
+	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+	     vma_migratable(vma)))
+		/* queue pages from current vma */
+		return 0;
+	return 1;
+}
+
 /*
  * Walk through page tables and collect pages to be migrated.
* @@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, void *private) -{ - int err = 0; - struct vm_area_struct *vma, *prev; - - vma = find_vma(mm, start); - if (!vma) - return -EFAULT; - prev = NULL; - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (prev && prev->vm_end < vma->vm_start) - return -EFAULT; - } - - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) - change_prot_numa(vma, start, endvma); - goto next; - } - - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { - - err = queue_pages_pgd_range(vma, start, endvma, nodes, - flags, private); - if (err) - break; - } -next: - prev = vma; - } - return err; + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) +{ + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .prev = NULL, + }; + struct mm_walk queue_pages_walk = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, + .mm = mm, + .private = &qp, + }; + + return walk_page_range(start, end, &queue_pages_walk); } /* @@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must hold down_read on the mmap_sem of the * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with the mm_sem of the vma hold. + * all allocations for pages that will be mapped into user space. Returns + * NULL when no page can be allocated. */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; unsigned int cpuset_mems_cookie; + struct zonelist *zl; + nodemask_t *nmask; retry_cpuset: pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && + pol->mode != MPOL_INTERLEAVE)) { + /* + * For hugepage allocation and non-interleave policy which + * allows the current node, we only try to allocate from the + * current node and don't fall back to other nodes, as the + * cost of remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. 
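
This conversion is the pagewalk pattern the rest of the series keeps applying: per-level callbacks plus a private cookie bundled in struct mm_walk, with walk_page_range() driving the traversal. A stripped-down walker of the same shape, assuming this series' mm_walk API; count_present() and struct count_ctx are hypothetical names, not from the patch:

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/sched.h>

struct count_ctx {
	unsigned long present;
};

static int count_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct count_ctx *ctx = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	/* Same THP handling as queue_pages_pte_range() above. */
	split_huge_page_pmd(walk->vma, addr, pmd);
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (pte_present(*pte))
			ctx->present++;
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static unsigned long count_present(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	struct count_ctx ctx = { 0 };
	struct mm_walk walk = {
		.pmd_entry = count_pte_range,
		.mm = mm,
		.private = &ctx,
	};

	walk_page_range(start, end, &walk);
	return ctx.present;
}

The point of the refactor is that only the leaf logic is caller-specific; the pgd/pud/pmd loops that queue_pages_range() used to hand-roll now live once in mm/pagewalk.c.
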
+ */ + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + goto out; + } + } + + if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - - return page; + goto out; } - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, node), - policy_nodemask(gfp, pol)); + + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); mpol_cond_put(pol); + page = __alloc_pages_nodemask(gfp, order, zl, nmask); +out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, +void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { pte_t pte; @@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - isolate_huge_page(page, &pagelist); + if (PageHead(page)) + isolate_huge_page(page, &pagelist); goto put_and_set; } diff --git a/mm/mincore.c b/mm/mincore.c index 46527c023e0c..be25efde64a4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -19,38 +19,25 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> -static void mincore_hugetlb_page_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - unsigned char *vec) +static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, + unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - struct hstate *h; + unsigned char present; + unsigned char *vec = walk->private; - h = hstate_vma(vma); - while (1) { - unsigned char present; - pte_t *ptep; - /* - * Huge pages are always in RAM for now, but - * theoretically it needs to be checked. - */ - ptep = huge_pte_offset(current->mm, - addr & huge_page_mask(h)); - present = ptep && !huge_pte_none(huge_ptep_get(ptep)); - while (1) { - *vec = present; - vec++; - addr += PAGE_SIZE; - if (addr == end) - return; - /* check hugepage border */ - if (!(addr & ~huge_page_mask(h))) - break; - } - } + /* + * Hugepages under user process are always in RAM and never + * swapped out, but theoretically it needs to be checked. 
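
The new hugepage branch in alloc_pages_vma() boils down to one question: does the policy allow THP to come from the local node? If it does, only that node is tried, since a remote huge page tends to cost more in access latency than it saves. The predicate, restated as a hypothetical helper that does not exist in the patch:

#include <linux/mempolicy.h>
#include <linux/nodemask.h>

static bool thp_prefer_local_only(struct mempolicy *pol, int node,
				  nodemask_t *nmask)
{
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return false;
	if (pol->mode == MPOL_INTERLEAVE)
		return false;
	/* A NULL nodemask means every node is allowed, including this one. */
	return !nmask || node_isset(node, *nmask);
}
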
+ */ + present = pte && !huge_pte_none(huge_ptep_get(pte)); + for (; addr != end; vec++, addr += PAGE_SIZE) + *vec = present; + walk->private = vec; #else BUG(); #endif + return 0; } /* @@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) return present; } -static void mincore_unmapped_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - unsigned char *vec) +static int __mincore_unmapped_range(unsigned long addr, unsigned long end, + struct vm_area_struct *vma, unsigned char *vec) { unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; @@ -111,23 +97,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, for (i = 0; i < nr; i++) vec[i] = 0; } + return nr; +} + +static int mincore_unmapped_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + walk->private += __mincore_unmapped_range(addr, end, + walk->vma, walk->private); + return 0; } -static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - unsigned char *vec) +static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) { - unsigned long next; spinlock_t *ptl; + struct vm_area_struct *vma = walk->vma; pte_t *ptep; + unsigned char *vec = walk->private; + int nr = (end - addr) >> PAGE_SHIFT; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + memset(vec, 1, nr); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmd)) { + __mincore_unmapped_range(addr, end, vma, vec); + goto out; + } - ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { pte_t pte = *ptep; - next = addr + PAGE_SIZE; if (pte_none(pte)) - mincore_unmapped_range(vma, addr, next, vec); + __mincore_unmapped_range(addr, addr + PAGE_SIZE, + vma, vec); else if (pte_present(pte)) *vec = 1; else { /* pte is a swap entry */ @@ -150,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } } vec++; - } while (ptep++, addr = next, addr != end); + } pte_unmap_unlock(ptep - 1, ptl); -} - -static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, - unsigned char *vec) -{ - unsigned long next; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { - if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { - vec += (next - addr) >> PAGE_SHIFT; - continue; - } - /* fall through */ - } - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - mincore_unmapped_range(vma, addr, next, vec); - else - mincore_pte_range(vma, pmd, addr, next, vec); - vec += (next - addr) >> PAGE_SHIFT; - } while (pmd++, addr = next, addr != end); -} - -static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, - unsigned char *vec) -{ - unsigned long next; - pud_t *pud; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - mincore_unmapped_range(vma, addr, next, vec); - else - mincore_pmd_range(vma, pud, addr, next, vec); - vec += (next - addr) >> PAGE_SHIFT; - } while (pud++, addr = next, addr != end); -} - -static void mincore_page_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - unsigned char *vec) -{ - unsigned long next; - pgd_t *pgd; - - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if 
(pgd_none_or_clear_bad(pgd)) - mincore_unmapped_range(vma, addr, next, vec); - else - mincore_pud_range(vma, pgd, addr, next, vec); - vec += (next - addr) >> PAGE_SHIFT; - } while (pgd++, addr = next, addr != end); +out: + walk->private += nr; + cond_resched(); + return 0; } /* @@ -224,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v { struct vm_area_struct *vma; unsigned long end; + int err; + struct mm_walk mincore_walk = { + .pmd_entry = mincore_pte_range, + .pte_hole = mincore_unmapped_range, + .hugetlb_entry = mincore_hugetlb, + .private = vec, + }; vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - + mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); - - if (is_vm_hugetlb_page(vma)) - mincore_hugetlb_page_range(vma, addr, end, vec); - else - mincore_page_range(vma, addr, end, vec); - + err = walk_page_range(addr, end, &mincore_walk); + if (err < 0) + return err; return (end - addr) >> PAGE_SHIFT; } diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..da9990acc08b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed, reserve; + long free, allowed, reserve; VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < -(s64)vm_committed_as_batch * num_online_cpus(), @@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ if (mm) { reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min(mm->total_vm / 32, reserve); + allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) @@ -2851,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); } vm_unacct_memory(nr_accounted); - - WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) /* Returns the next zone at or below highest_zoneidx in a zonelist */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { /* * Find the next suitable zone to use for the allocation. 
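
The user-visible contract preserved by the mincore rewrite above is that of mincore(2): the kernel fills one byte per page, with the least significant bit set when the page is resident. A small userspace check, useful when sanity-testing a conversion like this one:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	size_t len = 8 * psize;
	unsigned char *vec = malloc(len / psize);
	/* Anonymous mapping: pages are not resident until touched. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED || !vec)
		return 1;
	p[0] = 1;	/* fault in the first page only */
	if (mincore(p, len, vec))
		return 1;
	for (size_t i = 0; i < len / psize; i++)
		printf("page %zu: %s\n", i,
		       (vec[i] & 1) ? "resident" : "not resident");
	return 0;
}
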
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z); return z; } diff --git a/mm/nommu.c b/mm/nommu.c index 541bed64e348..1a19fb3b0463 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); +} +EXPORT_SYMBOL(get_user_pages_locked); + +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + down_read(&mm->mmap_sem); + ret = get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, 0); +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping @@ -1895,7 +1928,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed, reserve; + long free, allowed, reserve; vm_acct_memory(pages); @@ -1959,7 +1992,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ if (mm) { reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min(mm->total_vm / 32, reserve); + allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d503e9ce1c7b..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, * Don't allow any other task to have access to the reserves. 
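
The payoff of the new get_user_pages_unlocked() family shows up at the call sites: code that used to open-code the mmap_sem dance around get_user_pages() can now make a single call. A sketch of a hypothetical caller; read_user_page() is illustrative, not part of the patch:

#include <linux/mm.h>
#include <linux/sched.h>

/* The page, if returned, must be released with put_page() by the caller. */
static struct page *read_user_page(struct task_struct *tsk,
				   struct mm_struct *mm, unsigned long addr)
{
	struct page *page;

	/*
	 * Previously this required:
	 *	down_read(&mm->mmap_sem);
	 *	ret = get_user_pages(tsk, mm, addr, 1, 0, 0, &page, NULL);
	 *	up_read(&mm->mmap_sem);
	 */
	if (get_user_pages_unlocked(tsk, mm, addr, 1, 0, 0, &page) != 1)
		return NULL;
	return page;
}
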
	 */
 	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (unlikely(frozen(task)))
-			__thaw_task(task);
 		if (!force_kill)
 			return OOM_SCAN_ABORT;
 	}
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_pmds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 }
 
 /*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
 */
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
-int oom_kills_count(void)
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
+
+/**
+ * mark_tsk_oom_victim - marks the given task as an OOM victim.
+ * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read, and never after
+ * the OOM killer has been disabled.
+ */
+void mark_tsk_oom_victim(struct task_struct *tsk)
 {
-	return atomic_read(&oom_kills);
+	WARN_ON(oom_killer_disabled);
+	/* OOM killer might race with memcg OOM */
+	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
+	/*
+	 * Make sure that the task is woken up from uninterruptible sleep
+	 * if it is frozen, because the OOM killer wouldn't be able to free
+	 * any memory and would livelock. freezing_slow_path() will tell the
+	 * freezer that TIF_MEMDIE tasks should be ignored.
+	 */
+	__thaw_task(tsk);
+	atomic_inc(&oom_victims);
+}
+
+/**
+ * unmark_oom_victim - unmarks the current task as an OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
+ */
+void unmark_oom_victim(void)
+{
+	if (!test_and_clear_thread_flag(TIF_MEMDIE))
+		return;
+
+	down_read(&oom_sem);
+	/*
+	 * There is no need to signal the last oom_victim if there
+	 * is nobody who cares.
+	 */
+	if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+		wake_up_all(&oom_victims_wait);
+	up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger the OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be discussed with the MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+	/*
+	 * Make sure to not race with an ongoing OOM killer
+	 * and that the current task is not the victim.
+ */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; } -void note_oom_kill(void) +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) { - atomic_inc(&oom_kills); + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ - if (task_will_free_mem(p)) { - set_tsk_thread_flag(p, TIF_MEMDIE); + task_lock(p); + if (p->mm && task_will_free_mem(p)) { + mark_tsk_oom_victim(p); + task_unlock(p); put_task_struct(p); return; } + task_unlock(p); if (__ratelimit(&oom_rs)) dump_header(p, gfp_mask, order, memcg, nodemask); @@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; + mark_tsk_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } rcu_read_unlock(); - set_tsk_thread_flag(victim, TIF_MEMDIE); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); put_task_struct(victim); } @@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. + * + * But don't select if current has already released its mm and cleared + * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. */ - if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + if (current->mm && + (fatal_signal_pending(current) || task_will_free_mem(current))) { + mark_tsk_oom_victim(current); return; } @@ -688,6 +772,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. 
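
The disable/enable pair above is a reader/writer protocol: OOM kills run under the read side of oom_sem, while oom_killer_disable() takes the write side to fence off new kills and then waits for the oom_victims count to drain to zero. A userspace model of the same protocol, with pthreads standing in for the rwsem and the waitqueue; all names here are illustrative and none exist in the kernel:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t oom_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t victims_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t victims_cv = PTHREAD_COND_INITIALIZER;
static int victims;
static bool disabled;

static void model_mark_victim(void)
{
	pthread_rwlock_rdlock(&oom_lock);	/* like down_read(&oom_sem) */
	pthread_mutex_lock(&victims_mtx);
	victims++;
	pthread_mutex_unlock(&victims_mtx);
	pthread_rwlock_unlock(&oom_lock);
}

static void model_unmark_victim(void)
{
	pthread_mutex_lock(&victims_mtx);
	/* mirrors: if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) */
	if (--victims == 0 && disabled)
		pthread_cond_broadcast(&victims_cv);	/* wake_up_all() */
	pthread_mutex_unlock(&victims_mtx);
}

static bool model_disable(void)
{
	pthread_rwlock_wrlock(&oom_lock);	/* excludes in-flight OOM kills */
	disabled = true;
	pthread_rwlock_unlock(&oom_lock);

	/* wait_event(oom_victims_wait, !atomic_read(&oom_victims)) */
	pthread_mutex_lock(&victims_mtx);
	while (victims > 0)
		pthread_cond_wait(&victims_cv, &victims_mtx);
	pthread_mutex_unlock(&victims_mtx);
	return true;
}
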
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory() and returns true unless the OOM killer has
+ * been disabled by oom_killer_disable(); returns false in that case.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
+{
+	bool ret = false;
+
+	down_read(&oom_sem);
+	if (!oom_killer_disabled) {
+		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+		ret = true;
+	}
+	up_read(&oom_sem);
+
+	return ret;
+}
+
 /*
  * The pagefault handler calls here because it is out of memory, so kill a
  * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
+	down_read(&oom_sem);
 	if (mem_cgroup_oom_synchronize(true))
-		return;
+		goto unlock;
 
 	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
 	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-		out_of_memory(NULL, 0, 0, NULL, false);
+		if (!oom_killer_disabled)
+			__out_of_memory(NULL, 0, 0, NULL, false);
+		else
+			/*
+			 * There shouldn't be any user tasks runnable while the
+			 * OOM killer is disabled, so the current task has to
+			 * be a racing OOM victim for which oom_killer_disable()
+			 * is waiting.
+			 */
+			WARN_ON(test_thread_flag(TIF_MEMDIE));
+
 		oom_zonelist_unlock(zonelist, GFP_KERNEL);
 	}
+unlock:
+	up_read(&oom_sem);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f4335238e33..6a73e47e81c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2168,9 +2168,12 @@ EXPORT_SYMBOL(account_page_redirty);
 */
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
+	int ret;
+
 	wbc->pages_skipped++;
+	ret = __set_page_dirty_nobuffers(page);
 	account_page_redirty(page);
-	return __set_page_dirty_nobuffers(page);
+	return ret;
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
 
@@ -2308,12 +2311,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	unsigned long memcg_flags;
 	struct mem_cgroup *memcg;
-	bool locked;
 	int ret;
 
-	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2338,19 +2339,17 @@ int test_clear_page_writeback(struct page *page)
 		dec_zone_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITTEN);
 	}
-	mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+	mem_cgroup_end_page_stat(memcg);
 	return ret;
 }
 
 int __test_set_page_writeback(struct page *page, bool keep_write)
 {
 	struct address_space *mapping = page_mapping(page);
-	unsigned long memcg_flags;
 	struct mem_cgroup *memcg;
-	bool locked;
 	int ret;
 
-	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2380,7 +2379,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITEBACK);
 	}
-	mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+	mem_cgroup_end_page_stat(memcg);
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index
f121050e8530..8d52ab18fe0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -381,36 +379,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -/* update __split_huge_page_refcount if you change this function */ -static int destroy_compound_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - int bad = 0; - - if (unlikely(compound_order(page) != order)) { - bad_page(page, "wrong compound order", 0); - bad++; - } - - __ClearPageHead(page); - - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - - if (unlikely(!PageTail(p))) { - bad_page(page, "PageTail not set", 0); - bad++; - } else if (unlikely(p->first_page != page)) { - bad_page(page, "first_page not consistent", 0); - bad++; - } - __ClearPageTail(p); - } - - return bad; -} - static inline void prep_zero_page(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -613,10 +581,7 @@ static inline void __free_one_page(struct page *page, int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); - - if (unlikely(PageCompound(page))) - if (unlikely(destroy_compound_page(page, order))) - return; + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); if (is_migrate_isolate(migratetype)) { @@ -797,21 +762,40 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return 0; + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + return 1; + } + if (unlikely(page->first_page != head_page)) { + bad_page(page, "first_page not consistent", 0); + return 1; + } + return 0; +} + static bool free_pages_prepare(struct page *page, unsigned int order) { - int i; - int bad = 0; + bool compound = PageCompound(page); + int i, bad = 0; VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) page->mapping = NULL; - for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page); + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); bad += free_pages_check(page + i); + } if (bad) return false; @@ -970,7 +954,8 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + int alloc_flags) { int i; @@ -994,6 +979,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) set_page_owner(page, order, gfp_flags); + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * allocate the page. The expectation is that the caller is taking + * steps that will free more memory. The caller should avoid the page + * being used for !PFMEMALLOC purposes. 
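
free_tail_pages_check() above leans on IS_ENABLED(CONFIG_DEBUG_VM) instead of #ifdef: the early return makes the rest of the body dead code when the option is off, so the compiler drops it entirely while the code still gets type-checked in every build. The same idiom in a self-contained form; the MY_DEBUG_CHECKS macro is a stand-in for the kernel helper, not a real symbol:

#include <stdio.h>

#define MY_DEBUG_CHECKS 0	/* stand-in for IS_ENABLED(CONFIG_DEBUG_VM) */

static int debug_check(int value)
{
	if (!MY_DEBUG_CHECKS)
		return 0;	/* everything below is eliminated at compile time */
	if (value < 0) {
		fprintf(stderr, "bad value %d\n", value);
		return 1;
	}
	return 0;
}

int main(void)
{
	return debug_check(-1);
}
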
+ */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return 0; } @@ -1130,39 +1123,34 @@ static void change_pageblock_range(struct page *pageblock_page, } /* - * If breaking a large block of pages, move all free pages to the preferred - * allocation list. If falling back for a reclaimable kernel allocation, be - * more aggressive about taking ownership of free pages. + * When we are falling back to another migratetype during allocation, try to + * steal extra free pages from the same pageblocks to satisfy further + * allocations, instead of polluting multiple pageblocks. * - * On the other hand, never change migration type of MIGRATE_CMA pageblocks - * nor move CMA pages to different free lists. We don't want unmovable pages - * to be allocated from MIGRATE_CMA areas. + * If we are stealing a relatively large buddy page, it is likely there will + * be more free pages in the pageblock, so try to steal them all. For + * reclaimable and unmovable allocations, we steal regardless of page size, + * as fragmentation caused by those allocations polluting movable pageblocks + * is worse than movable allocations stealing from unmovable and reclaimable + * pageblocks. * - * Returns the new migratetype of the pageblock (or the same old migratetype - * if it was unchanged). + * If we claim more than half of the pageblock, change pageblock's migratetype + * as well. */ -static int try_to_steal_freepages(struct zone *zone, struct page *page, +static void try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) { int current_order = page_order(page); - /* - * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. We also ensure the freepage_migratetype - * is set to CMA so it is returned to the correct freelist in case - * the page ends up being not actually allocated from the pcp lists. 
- */ - if (is_migrate_cma(fallback_type)) - return fallback_type; - /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return start_type; + return; } if (current_order >= pageblock_order / 2 || start_type == MIGRATE_RECLAIMABLE || + start_type == MIGRATE_UNMOVABLE || page_group_by_mobility_disabled) { int pages; @@ -1170,15 +1158,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) { - + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); - return start_type; - } - } - - return fallback_type; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1188,14 +1170,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; - int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { + int i; for (i = 0;; i++) { - migratetype = fallbacks[start_migratetype][i]; + int migratetype = fallbacks[start_migratetype][i]; + int buddy_type = start_migratetype; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) @@ -1209,25 +1192,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct page, lru); area->nr_free--; - new_type = try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); + if (!is_migrate_cma(migratetype)) { + try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); + } else { + /* + * When borrowing from MIGRATE_CMA, we need to + * release the excess buddy pages to CMA + * itself, and we do not try to steal extra + * free pages. + */ + buddy_type = migratetype; + } /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); expand(zone, page, order, current_order, area, - new_type); - /* The freepage_migratetype may differ from pageblock's + buddy_type); + + /* + * The freepage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages. This is OK as long as it does - * not differ for MIGRATE_CMA type. + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. */ - set_freepage_migratetype(page, new_type); + set_freepage_migratetype(page, buddy_type); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype, new_type); + start_migratetype, migratetype); return page; } @@ -1642,9 +1639,7 @@ int split_free_page(struct page *page) } /* - * Really, prep_compound_page() should be called from __rmqueue_bulk(). But - * we cheat by calling it from here, in the order > 0 path. Saves a branch - * or two. + * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
*/ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, @@ -1655,7 +1650,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct page *page; bool cold = ((gfp_flags & __GFP_COLD) != 0); -again: if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; @@ -1711,8 +1705,6 @@ again: local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); - if (prep_new_page(page, order, gfp_flags)) - goto again; return page; failed: @@ -2033,10 +2025,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) * a page. */ static struct page * -get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int classzone_idx, int migratetype) +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) { + struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; @@ -2055,8 +2047,8 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { unsigned long mark; if (IS_ENABLED(CONFIG_NUMA) && zlc_active && @@ -2073,7 +2065,7 @@ zonelist_scan: * time the page has in memory before being reclaimed. */ if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(preferred_zone, zone)) + if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; @@ -2111,7 +2103,7 @@ zonelist_scan: mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2132,7 +2124,7 @@ zonelist_scan: } if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zone, zone)) goto this_zone_full; /* @@ -2154,7 +2146,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) + ac->classzone_idx, alloc_flags)) goto try_this_zone; /* @@ -2175,27 +2167,18 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(preferred_zone, zone, order, - gfp_mask, migratetype); - if (page) - break; + page = buffered_rmqueue(ac->preferred_zone, zone, order, + gfp_mask, ac->migratetype); + if (page) { + if (prep_new_page(page, order, gfp_mask, alloc_flags)) + goto try_this_zone; + return page; + } this_zone_full: if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (page) { - /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was - * necessary to allocate the page. The expectation is - * that the caller is taking steps that will free more - * memory. The caller should avoid the page being used - * for !PFMEMALLOC purposes. - */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); - return page; - } - /* * The first pass makes sure allocations are spread fairly within the * local node. 
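
Most of the page_alloc.c churn in this file is mechanical: the long parameter lists threaded through the allocator slow path are folded into a single struct alloc_context, filled once in __alloc_pages_nodemask() and passed down as a const pointer. The field list below is reconstructed from the accessors used in these hunks (the struct itself lands in mm/internal.h as part of this series), so treat it as a sketch rather than a verbatim copy:

/*
 * Invariant allocation parameters, built once per allocation attempt.
 */
struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	struct zone *preferred_zone;
	int classzone_idx;
	int migratetype;
	enum zone_type high_zoneidx;
};

Only alloc_flags and the gfp mask keep travelling as separate arguments, because the slow path rewrites both along the way.
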
However, the local node might have free pages left @@ -2208,7 +2191,7 @@ this_zone_full: alloc_flags &= ~ALLOC_FAIR; if (nr_fair_skipped) { zonelist_rescan = true; - reset_alloc_batches(preferred_zone); + reset_alloc_batches(ac->preferred_zone); } if (nr_online_nodes > 1) zonelist_rescan = true; @@ -2330,44 +2313,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype, unsigned long *did_some_progress) + const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page; *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. */ - if (!oom_zonelist_trylock(zonelist, gfp_mask)) { + if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; } /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; @@ -2379,7 +2347,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (high_zoneidx < ZONE_NORMAL) + if (ac->high_zoneidx < ZONE_NORMAL) goto out; /* The OOM killer does not compensate for light reclaim */ if (!(gfp_mask & __GFP_FS)) @@ -2395,10 +2363,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: - oom_zonelist_unlock(zonelist, gfp_mask); + oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; } @@ -2406,10 +2374,9 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { unsigned long compact_result; struct page *page; @@ -2418,10 +2385,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, mode, - contended_compaction, - alloc_flags, classzone_idx); + compact_result 
= try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2440,10 +2405,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) { struct zone *zone = page_zone(page); @@ -2467,10 +2430,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { return NULL; } @@ -2478,8 +2440,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, /* Perform direct synchronous page reclaim */ static int -__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, - nodemask_t *nodemask) +__perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) { struct reclaim_state reclaim_state; int progress; @@ -2493,7 +2455,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(ac->zonelist, order, gfp_mask, + ac->nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2507,28 +2470,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, unsigned long *did_some_progress) + int alloc_flags, const struct alloc_context *ac, + unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; - *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, - nodemask); + *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) return NULL; /* After successful reclaim, reconsider all zones for allocation */ if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(zonelist); + zlc_clear_zones_full(ac->zonelist); retry: - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, - migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -2549,36 +2507,30 @@ retry: */ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + const struct 
alloc_context *ac) { struct page *page; do { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, + HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; } -static void wake_all_kswapds(unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone, - nodemask_t *nodemask) +static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->high_zoneidx, ac->nodemask) + wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } static inline int @@ -2637,9 +2589,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + struct alloc_context *ac) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2675,8 +2625,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, - preferred_zone, nodemask); + wake_all_kswapds(order, ac); /* * OK, we're below the kswapd watermark and have kicked background @@ -2689,17 +2638,16 @@ retry: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, - NULL, &preferred_zone); - classzone_idx = zonelist_zone_idx(preferred_zoneref); + preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, NULL, &ac->preferred_zone); + ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); } /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) goto got_pg; @@ -2710,11 +2658,10 @@ retry: * the allocation is high priority and these type of * allocations are system rather than user orientated */ - zonelist = node_zonelist(numa_node_id(), gfp_mask); + ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + + page = __alloc_pages_high_priority(gfp_mask, order, ac); - page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, - preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2743,11 +2690,9 @@ retry: * Try direct compaction. The first pass is asynchronous. 
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
 	 */
-	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
-					high_zoneidx, nodemask, alloc_flags,
-					preferred_zone,
-					classzone_idx, migratetype,
-					migration_mode, &contended_compaction,
+	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+					migration_mode,
+					&contended_compaction,
 					&deferred_compaction);
 	if (page)
 		goto got_pg;
@@ -2793,12 +2738,8 @@ retry:
 		migration_mode = MIGRATE_SYNC_LIGHT;

 	/* Try direct reclaim and then allocating */
-	page = __alloc_pages_direct_reclaim(gfp_mask, order,
-					zonelist, high_zoneidx,
-					nodemask,
-					alloc_flags, preferred_zone,
-					classzone_idx, migratetype,
-					&did_some_progress);
+	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
+							&did_some_progress);
 	if (page)
 		goto got_pg;

@@ -2812,17 +2753,15 @@ retry:
 		 * start OOM killing tasks.
 		 */
 		if (!did_some_progress) {
-			page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
-						high_zoneidx, nodemask,
-						preferred_zone, classzone_idx,
-						migratetype,&did_some_progress);
+			page = __alloc_pages_may_oom(gfp_mask, order, ac,
+							&did_some_progress);
 			if (page)
 				goto got_pg;
 			if (!did_some_progress)
 				goto nopage;
 		}
 		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto retry;
 	} else {
 		/*
@@ -2830,11 +2769,9 @@ retry:
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
-		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
-					high_zoneidx, nodemask, alloc_flags,
-					preferred_zone,
-					classzone_idx, migratetype,
-					migration_mode, &contended_compaction,
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					alloc_flags, ac, migration_mode,
+					&contended_compaction,
 					&deferred_compaction);
 		if (page)
 			goto got_pg;
@@ -2842,11 +2779,7 @@ retry:

 nopage:
 	warn_alloc_failed(gfp_mask, order, NULL);
-	return page;
 got_pg:
-	if (kmemcheck_enabled)
-		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
 	return page;
 }

@@ -2857,14 +2790,16 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zone *preferred_zone;
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
-	int migratetype = gfpflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
-	int classzone_idx;
+	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+	struct alloc_context ac = {
+		.high_zoneidx = gfp_zone(gfp_mask),
+		.nodemask = nodemask,
+		.migratetype = gfpflags_to_migratetype(gfp_mask),
+	};

 	gfp_mask &= gfp_allowed_mask;

@@ -2883,37 +2818,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;

-	if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;

retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();

+	/* We set it here, as __alloc_pages_slowpath might have changed it */
+	ac.zonelist = zonelist;
+
 	/* The preferred zone is used for statistics later */
-	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
-				nodemask ? : &cpuset_current_mems_allowed,
-				&preferred_zone);
-	if (!preferred_zone)
+	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
+				ac.nodemask ? : &cpuset_current_mems_allowed,
+				&ac.preferred_zone);
+	if (!ac.preferred_zone)
 		goto out;
-	classzone_idx = zonelist_zone_idx(preferred_zoneref);
+	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

 	/* First allocation attempt */
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, alloc_flags,
-			preferred_zone, classzone_idx, migratetype);
+	alloc_mask = gfp_mask|__GFP_HARDWALL;
+	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (unlikely(!page)) {
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
 		 * complete.
 		 */
-		gfp_mask = memalloc_noio_flags(gfp_mask);
-		page = __alloc_pages_slowpath(gfp_mask, order,
-				zonelist, high_zoneidx, nodemask,
-				preferred_zone, classzone_idx, migratetype);
+		alloc_mask = memalloc_noio_flags(gfp_mask);
+
+		page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 	}

-	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+	if (kmemcheck_enabled && page)
+		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

 out:
 	/*
@@ -5047,8 +4985,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
-		(u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
+	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+		(u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
 #endif
 	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
 				  zones_size, zholes_size);
@@ -5420,9 +5358,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 				arch_zone_highest_possible_pfn[i])
 			pr_cont("empty\n");
 		else
-			pr_cont("[mem %0#10lx-%0#10lx]\n",
-				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
-				(arch_zone_highest_possible_pfn[i]
+			pr_cont("[mem %#018Lx-%#018Lx]\n",
+				(u64)arch_zone_lowest_possible_pfn[i]
+					<< PAGE_SHIFT,
+				((u64)arch_zone_highest_possible_pfn[i]
 					<< PAGE_SHIFT) - 1);
 	}

@@ -5430,15 +5369,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	pr_info("Movable zone start for each node\n");
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (zone_movable_pfn[i])
-			pr_info("  Node %d: %#010lx\n", i,
-			       zone_movable_pfn[i] << PAGE_SHIFT);
+			pr_info("  Node %d: %#018Lx\n", i,
+			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
 	}

 	/* Print out the early node map */
 	pr_info("Early memory node ranges\n");
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		pr_info("  node %3d: [mem %#010lx-%#010lx]\n", nid,
-		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
+		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+			(u64)start_pfn << PAGE_SHIFT,
+			((u64)end_pfn << PAGE_SHIFT) - 1);

 	/* Initialise every node */
 	mminit_verify_pageflags_layout();
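Beyond widening the output format to %#018Lx, the printk hunks above also fix where the u64 cast sits: the removed (u64)(end_pfn << PAGE_SHIFT) - 1 performs the shift in unsigned long first, which truncates on 32-bit before the widening happens, while the added ((u64)end_pfn << PAGE_SHIFT) - 1 widens first. A small user-space demonstration of the difference (uint32_t stands in for a 32-bit unsigned long; the pfn value is illustrative):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* A pfn just past 1M pages, i.e. beyond 4GiB of address space. */
	uint32_t end_pfn = 0x100001;

	uint64_t wrong = (uint64_t)(end_pfn << PAGE_SHIFT) - 1; /* shift truncates first */
	uint64_t right = ((uint64_t)end_pfn << PAGE_SHIFT) - 1; /* widen, then shift */

	printf("wrong: %#018llx\n", (unsigned long long)wrong); /* 0x0000000000000fff */
	printf("right: %#018llx\n", (unsigned long long)right); /* 0x0000000100000fff */
	return 0;
}

Any pfn at or above 1 << (32 - PAGE_SHIFT) — the 4GiB boundary — hits the truncation, which is exactly the range these boot-time banner lines exist to report.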
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a009574fbba9..11b4beda14ba 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
 /**
  * page_counter_memparse - memparse() for page counter limits
  * @buf: string to parse
+ * @max: string meaning maximum possible value
  * @nr_pages: returns the result in number of pages
  *
  * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
  * limited to %PAGE_COUNTER_MAX.
  */
-int page_counter_memparse(const char *buf, unsigned long *nr_pages)
+int page_counter_memparse(const char *buf, const char *max,
+			  unsigned long *nr_pages)
 {
-	char unlimited[] = "-1";
 	char *end;
 	u64 bytes;

-	if (!strncmp(buf, unlimited, sizeof(unlimited))) {
+	if (!strcmp(buf, max)) {
 		*nr_pages = PAGE_COUNTER_MAX;
 		return 0;
 	}
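With the extra argument, each interface can pick its own "unlimited" token — a cgroup v2 file can pass "max" while a legacy file keeps "-1" — instead of the parser hard-coding "-1". A hypothetical write handler showing the new calling convention (the function name and the surrounding buffer handling are illustrative, not part of this patch):

static ssize_t memory_limit_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	unsigned long limit;
	int err;

	buf = strstrip(buf);	/* trim the newline that echo appends */

	/* "max" maps to PAGE_COUNTER_MAX; anything else goes to memparse() */
	err = page_counter_memparse(buf, "max", &limit);
	if (err)
		return err;

	/* ... apply "limit" to the counter ... */
	return nbytes;
}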
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9ab4a9b5bc09..0993f5f36b01 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order)

 void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
 {
-	struct page_ext *page_ext;
-	struct stack_trace *trace;
-
-	page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = lookup_page_ext(page);
+	struct stack_trace trace = {
+		.nr_entries = 0,
+		.max_entries = ARRAY_SIZE(page_ext->trace_entries),
+		.entries = &page_ext->trace_entries[0],
+		.skip = 3,
+	};

-	trace = &page_ext->trace;
-	trace->nr_entries = 0;
-	trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
-	trace->entries = &page_ext->trace_entries[0];
-	trace->skip = 3;
-	save_stack_trace(&page_ext->trace);
+	save_stack_trace(&trace);

 	page_ext->order = order;
 	page_ext->gfp_mask = gfp_mask;
+	page_ext->nr_entries = trace.nr_entries;

 	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 }
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 	int ret;
 	int pageblock_mt, page_mt;
 	char *kbuf;
+	struct stack_trace trace = {
+		.nr_entries = page_ext->nr_entries,
+		.entries = &page_ext->trace_entries[0],
+	};

 	kbuf = kmalloc(count, GFP_KERNEL);
 	if (!kbuf)
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 	if (ret >= count)
 		goto err;

-	ret += snprint_stack_trace(kbuf + ret, count - ret,
-			&page_ext->trace, 0);
+	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
 	if (ret >= count)
 		goto err;
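The page_owner change stops persisting a whole struct stack_trace in page_ext; only the raw entries array and nr_entries survive, and a transient descriptor is rebuilt on the stack for both capture and printing. A sketch of that round trip, where rec is a stand-in for the persistent page_ext record and its fields mirror the ones used in the hunks above:

	/* Capture: wrap the persistent buffer in a throwaway descriptor. */
	struct stack_trace trace = {
		.nr_entries  = 0,
		.max_entries = ARRAY_SIZE(rec->trace_entries),
		.entries     = rec->trace_entries,
		.skip        = 3,	/* drop allocator-internal frames */
	};
	save_stack_trace(&trace);
	rec->nr_entries = trace.nr_entries;	/* persist only the raw data */

	/* Print: reconstitute an equivalent descriptor from the raw data. */
	struct stack_trace copy = {
		.nr_entries = rec->nr_entries,
		.entries    = rec->trace_entries,
	};
	snprint_stack_trace(kbuf, count, &copy, 0);

The likely payoff is per-page memory: the descriptor's max_entries, skip, and entries pointer no longer occupy every page_ext, only the data they describe does.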
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b264bda46e1b..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
again:
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd)) {
+		if (pmd_none(*pmd) || !walk->vma) {
 			if (walk->pte_hole)
 				err = walk->pte_hole(addr, next, walk);
 			if (err)
@@ -59,7 +59,7 @@ again:
 			continue;

 		split_huge_page_pmd_mm(walk->mm, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		if (pmd_trans_unstable(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 				break;
 			continue;
 		}
-		if (walk->pud_entry)
-			err = walk->pud_entry(pud, addr, next, walk);
-		if (!err && (walk->pmd_entry || walk->pte_entry))
+		if (walk->pmd_entry || walk->pte_entry)
 			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return err;
 }

+static int walk_pgd_range(unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+
+	pgd = pgd_offset(walk->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry || walk->pte_entry)
+			err = walk_pud_range(pgd, addr, next, walk);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
+
 #ifdef CONFIG_HUGETLB_PAGE
 static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 				       unsigned long end)
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
 	return boundary < end ? boundary : end;
 }

-static int walk_hugetlb_range(struct vm_area_struct *vma,
-			      unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 			      struct mm_walk *walk)
 {
+	struct vm_area_struct *vma = walk->vma;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long next;
 	unsigned long hmask = huge_page_mask(h);
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 		if (pte && walk->hugetlb_entry)
 			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
 		if (err)
-			return err;
+			break;
 	} while (addr = next, addr != end);

-	return 0;
+	return err;
 }

 #else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
-			      unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 			      struct mm_walk *walk)
 {
 	return 0;
@@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 #endif /* CONFIG_HUGETLB_PAGE */

+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values mean
+ * an error, in which case we abort the current walk.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+			  struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	if (walk->test_walk)
+		return walk->test_walk(start, end, walk);
+
+	/*
+	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+	 * range, so we don't walk over it as we do for normal vmas. However,
+	 * some callers are interested in handling hole ranges and they don't
+	 * want to just ignore any single address range. Such users certainly
+	 * define their ->pte_hole() callbacks, so let's delegate them to handle
+	 * vma(VM_PFNMAP).
+	 */
+	if (vma->vm_flags & VM_PFNMAP) {
+		int err = 1;
+		if (walk->pte_hole)
+			err = walk->pte_hole(start, end, walk);
+		return err ? err : 1;
+	}
+	return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+			     struct mm_walk *walk)
+{
+	int err = 0;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (vma && is_vm_hugetlb_page(vma)) {
+		if (walk->hugetlb_entry)
+			err = walk_hugetlb_range(start, end, walk);
+	} else
+		err = walk_pgd_range(start, end, walk);
+
+	return err;
+}

 /**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
+ * walk_page_range - walk page table with caller specific callbacks
  *
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific work for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
+ * callbacks, the associated entries/pages are just ignored.
+ *
+ * The return values of these callbacks are commonly defined as below:
+ *  - 0   : handled the current entry successfully; if the end address has
+ *          not been reached yet, the walk continues.
+ *  - >0  : handled the current entry successfully, and return to the caller
+ *          with a caller-specific value.
+ *  - <0  : failed to handle the current entry; return to the caller with
+ *          an error code.
 *
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * Before starting to walk a page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
 *
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
 *
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
- *
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * Locking:
+ *   Callers of walk_page_range() and walk_page_vma() should hold
+ *   @walk->mm->mmap_sem, because these functions traverse the vma list
+ *   and/or access vma data.
 */
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
 		    struct mm_walk *walk)
 {
-	pgd_t *pgd;
-	unsigned long next;
 	int err = 0;
+	unsigned long next;
+	struct vm_area_struct *vma;

-	if (addr >= end)
-		return err;
+	if (start >= end)
+		return -EINVAL;

 	if (!walk->mm)
 		return -EINVAL;

 	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);

-	pgd = pgd_offset(walk->mm, addr);
+	vma = find_vma(walk->mm, start);
 	do {
-		struct vm_area_struct *vma = NULL;
-
-		next = pgd_addr_end(addr, end);
+		if (!vma) { /* after the last vma */
+			walk->vma = NULL;
+			next = end;
+		} else if (start < vma->vm_start) { /* outside vma */
+			walk->vma = NULL;
+			next = min(end, vma->vm_start);
+		} else { /* inside vma */
+			walk->vma = vma;
+			next = min(end, vma->vm_end);
+			vma = vma->vm_next;

-		/*
-		 * This function was not intended to be vma based.
-		 * But there are vma special cases to be handled:
-		 * - hugetlb vma's
-		 * - VM_PFNMAP vma's
-		 */
-		vma = find_vma(walk->mm, addr);
-		if (vma) {
-			/*
-			 * There are no page structures backing a VM_PFNMAP
-			 * range, so do not allow split_huge_page_pmd().
-			 */
-			if ((vma->vm_start <= addr) &&
-			    (vma->vm_flags & VM_PFNMAP)) {
-				if (walk->pte_hole)
-					err = walk->pte_hole(addr, next, walk);
-				if (err)
-					break;
-				pgd = pgd_offset(walk->mm, next);
-				continue;
-			}
-			/*
-			 * Handle hugetlb vma individually because pagetable
-			 * walk for the hugetlb page is dependent on the
-			 * architecture and we can't handled it in the same
-			 * manner as non-huge pages.
-			 */
-			if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
-			    is_vm_hugetlb_page(vma)) {
-				if (vma->vm_end < next)
-					next = vma->vm_end;
-				/*
-				 * Hugepage is very tightly coupled with vma,
-				 * so walk through hugetlb entries within a
-				 * given vma.
-				 */
-				err = walk_hugetlb_range(vma, addr, next, walk);
-				if (err)
-					break;
-				pgd = pgd_offset(walk->mm, next);
+			err = walk_page_test(start, next, walk);
+			if (err > 0)
 				continue;
-			}
-		}
-
-		if (pgd_none_or_clear_bad(pgd)) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
-			if (err)
+			if (err < 0)
 				break;
-			pgd++;
-			continue;
 		}
-		if (walk->pgd_entry)
-			err = walk->pgd_entry(pgd, addr, next, walk);
-		if (!err &&
-		    (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
-			err = walk_pud_range(pgd, addr, next, walk);
+		if (walk->vma || walk->pte_hole)
+			err = __walk_page_range(start, next, walk);
 		if (err)
 			break;
-		pgd++;
-	} while (addr = next, addr < end);
-
+	} while (start = next, start < end);
 	return err;
 }
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+	int err;
+
+	if (!walk->mm)
+		return -EINVAL;
+
+	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+	VM_BUG_ON(!vma);
+	walk->vma = vma;
+	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+	if (err > 0)
+		return 0;
+	if (err < 0)
+		return err;
+	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
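A minimal walker under the reworked API, as a usage sketch: it counts resident PTEs in a range and skips VMAs where accounting makes no sense. The function names and the VM_SPECIAL test are illustrative, not from this patch; the callback signatures follow the struct mm_walk that this file now expects:

/* Per-PTE callback: count present entries. Returning 0 keeps walking. */
static int count_pte(pte_t *pte, unsigned long addr,
		     unsigned long next, struct mm_walk *walk)
{
	unsigned long *resident = walk->private;

	if (pte_present(*pte))
		(*resident)++;
	return 0;
}

/* test_walk callback: returning 1 skips the vma, per the semantics above. */
static int count_test(unsigned long start, unsigned long end,
		      struct mm_walk *walk)
{
	return (walk->vma->vm_flags & VM_SPECIAL) ? 1 : 0;
}

static unsigned long count_resident(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	unsigned long resident = 0;
	struct mm_walk walk = {
		.pte_entry = count_pte,
		.test_walk = count_test,
		.mm        = mm,
		.private   = &resident,
	};

	down_read(&mm->mmap_sem);	/* required by the locking rules above */
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return resident;
}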
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afcd9e11..b1597690530c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
 		size_t bytes;

 		/* Get the pages we're interested in */
-		down_read(&mm->mmap_sem);
-		pages = get_user_pages(task, mm, pa, pages,
-				      vm_write, 0, process_pages, NULL);
-		up_read(&mm->mmap_sem);
-
+		pages = get_user_pages_unlocked(task, mm, pa, pages,
+						vm_write, 0, process_pages);
 		if (pages <= 0)
 			return -EFAULT;
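The same pattern repeats in mm/util.c further down: the open-coded lock/gup/unlock dance collapses into get_user_pages_unlocked(), which takes mmap_sem itself and may drop it across faults to shorten lock hold times. In outline (a fragment, with tsk/mm/start/nr_pages standing in for the caller's state):

	/* Before: caller-managed locking around the slow gup path. */
	down_read(&mm->mmap_sem);
	ret = get_user_pages(tsk, mm, start, nr_pages, write, 0, pages, NULL);
	up_read(&mm->mmap_sem);

	/* After: the helper locks, may drop the lock mid-fault (VM_FAULT_RETRY
	 * handling happens internally), and unlocks before returning. */
	ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, 0, pages);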
diff --git a/mm/rmap.c b/mm/rmap.c
index 70b32498d4f2..5e3e09081164 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page)
 {
 	struct mem_cgroup *memcg;
-	unsigned long flags;
-	bool locked;

-	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
 		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
 	}
-	mem_cgroup_end_page_stat(memcg, &locked, &flags);
+	mem_cgroup_end_page_stat(memcg);
 }

 static void page_remove_file_rmap(struct page *page)
 {
 	struct mem_cgroup *memcg;
-	unsigned long flags;
-	bool locked;

-	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+	memcg = mem_cgroup_begin_page_stat(page);

 	/* page still mapped by someone else? */
 	if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page)
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
out:
-	mem_cgroup_end_page_stat(memcg, &locked, &flags);
+	mem_cgroup_end_page_stat(memcg);
 }

 /**
diff --git a/mm/shmem.c b/mm/shmem.c
index b3e403181981..864c878401e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,7 +1131,7 @@ repeat:
 		 * truncated or holepunched since swap was confirmed.
 		 * shmem_undo_range() will have done some of the
 		 * unaccounting, now delete_from_swap_cache() will do
-		 * the rest (including mem_cgroup_uncharge_swapcache).
+		 * the rest.
 		 * Reset swap.val? No, leave it so "failed" goes back to
 		 * "repeat": reading a hole and writing should succeed.
 		 */
diff --git a/mm/util.c b/mm/util.c
index fec39d4509a9..f3ef639c4857 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start,
 				int nr_pages, int write, struct page **pages)
 {
 	struct mm_struct *mm = current->mm;
-	int ret;
-
-	down_read(&mm->mmap_sem);
-	ret = get_user_pages(current, mm, start, nr_pages,
-					write, 0, pages, NULL);
-	up_read(&mm->mmap_sem);
-
-	return ret;
+	return get_user_pages_unlocked(current, mm, start, nr_pages,
+				       write, 0, pages);
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);

diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcd90c891d8e..8e645ee52045 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -91,6 +91,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;

+	/* Can cgroups be reclaimed below their normal consumption range? */
+	unsigned int may_thrash:1;
+
 	unsigned int hibernation_mode:1;

 	/* One of the zones is ready for compaction */
@@ -1903,8 +1906,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && !zone_reclaimable(zone))
-		force_scan = true;
+	if (current_is_kswapd()) {
+		if (!zone_reclaimable(zone))
+			force_scan = true;
+		if (!mem_cgroup_lruvec_online(lruvec))
+			force_scan = true;
+	}
 	if (!global_reclaim(sc))
 		force_scan = true;

@@ -2290,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		struct lruvec *lruvec;
 		int swappiness;

+		if (mem_cgroup_low(root, memcg)) {
+			if (!sc->may_thrash)
+				continue;
+			mem_cgroup_events(memcg, MEMCG_LOW, 1);
+		}
+
 		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 		swappiness = mem_cgroup_swappiness(memcg);

@@ -2311,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			mem_cgroup_iter_break(root, memcg);
 			break;
 		}
-		memcg = mem_cgroup_iter(root, memcg, &reclaim);
-	} while (memcg);
+	} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));

 	/*
 	 * Shrink the slab caches in the same proportion that
@@ -2515,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					  struct scan_control *sc)
 {
+	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
 	bool zones_reclaimable;
-
+retry:
 	delayacct_freepages_start();

 	if (global_reclaim(sc))
@@ -2568,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	if (sc->compaction_ready)
 		return 1;

+	/* Untapped cgroup reserves?  Don't OOM, retry. */
+	if (!sc->may_thrash) {
+		sc->priority = initial_priority;
+		sc->may_thrash = 1;
+		goto retry;
+	}
+
 	/* Any of the zones still reclaimable?  Don't OOM. */
 	if (zones_reclaimable)
 		return 1;

@@ -3175,7 +3195,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	 */
 	if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
 			pfmemalloc_watermark_ok(pgdat))
-		wake_up(&pgdat->pfmemalloc_wait);
+		wake_up_all(&pgdat->pfmemalloc_wait);

 	/*
 	 * Fragmentation may mean that the system cannot be rebalanced
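The vmscan hunks implement a two-pass policy around the new low boundary: the first sweep skips cgroups under their protected range, and only a completely fruitless pass is repeated at the initial priority with may_thrash set, ahead of any OOM decision. A toy user-space model of that control flow, where pools stand in for memcgs and all names are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct pool { const char *name; long reclaimable; bool low_protected; };

/* One reclaim pass; pools under their "low" boundary are skipped unless
 * may_thrash is set -- mirroring the shrink_zone() hunk above. */
static long reclaim_pass(struct pool *pools, int n, bool may_thrash)
{
	long freed = 0;
	for (int i = 0; i < n; i++) {
		if (pools[i].low_protected && !may_thrash)
			continue;
		freed += pools[i].reclaimable;
		pools[i].reclaimable = 0;
	}
	return freed;
}

int main(void)
{
	struct pool pools[] = {
		{ "protected-memcg", 64, true },
		{ "plain-memcg",      0, false },
	};
	bool may_thrash = false;
	long freed;

retry:
	freed = reclaim_pass(pools, 2, may_thrash);
	if (!freed && !may_thrash) {
		may_thrash = true;	/* untapped reserves? don't OOM, retry */
		goto retry;
	}
	printf("freed %ld pages\n", freed);	/* prints "freed 64 pages" */
	return 0;
}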
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9943e5fd74e6..4f5cd974e11a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1437,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w)
 		if (need_update(cpu) &&
 			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-			schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
-				__round_jiffies_relative(sysctl_stat_interval, cpu));
+			schedule_delayed_work_on(cpu,
+				&per_cpu(vmstat_work, cpu), 0);

 	put_online_cpus();

@@ -1452,7 +1452,7 @@ static void __init start_shepherd_timer(void)
 	int cpu;

 	for_each_possible_cpu(cpu)
-		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+		INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);

 	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))