diff options
author | Ingo Molnar <mingo@kernel.org> | 2018-01-30 15:08:27 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-01-30 15:08:27 +0100 |
commit | 7e86548e2cc8d308cb75439480f428137151b0de (patch) | |
tree | fa3bcdedb64f4642a21080bc2b4ddc69ce2b2285 /mm | |
parent | 64e16720ea0879f8ab4547e3b9758936d483909b (diff) | |
parent | d8a5b80568a9cb66810e75b182018e9edb68e8ff (diff) | |
download | linux-7e86548e2cc8d308cb75439480f428137151b0de.tar.bz2 |
Merge tag 'v4.15' into x86/pti, to be able to merge dependent changes
Time has come to switch PTI development over to a v4.15 base - we'll still
try to make sure that all PTI fixes backport cleanly to v4.14 and earlier.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
67 files changed, 1709 insertions, 1023 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 9c4bdddd80c2..03ff7703d322 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -756,3 +756,12 @@ config PERCPU_STATS This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can be used to help understand percpu memory usage. + +config GUP_BENCHMARK + bool "Enable infrastructure for get_user_pages_fast() benchmarking" + default n + help + Provides /sys/kernel/debug/gup_benchmark that helps with testing + performance of get_user_pages_fast(). + + See tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5b0adf1435de..e5e606ee5f71 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - depends on !KMEMCHECK select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 4659b93cba43..e669f02c5a54 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n KCOV_INSTRUMENT_kmemleak.o := n -KCOV_INSTRUMENT_kmemcheck.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n @@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o @@ -82,6 +80,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o +obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e19606bb41a0..b5f940ce0143 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -113,11 +113,23 @@ static const struct file_operations bdi_debug_stats_fops = { .release = single_release, }; -static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) { + if (!bdi_debug_root) + return -ENOMEM; + bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); + if (!bdi->debug_dir) + return -ENOMEM; + bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); + if (!bdi->debug_stats) { + debugfs_remove(bdi->debug_dir); + return -ENOMEM; + } + + return 0; } static void bdi_debug_unregister(struct backing_dev_info *bdi) @@ -129,9 +141,10 @@ static void bdi_debug_unregister(struct backing_dev_info *bdi) static inline void bdi_debug_init(void) { } -static inline void bdi_debug_register(struct backing_dev_info *bdi, +static inline int bdi_debug_register(struct backing_dev_info *bdi, const char *name) { + return 0; } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { @@ -1072,23 +1085,3 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); - -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - char kbuf[] = "0\n"; - - if (*ppos || *lenp < sizeof(kbuf)) { - *lenp = 0; - return 0; - } - - if (copy_to_user(buffer, kbuf, sizeof(kbuf))) - return -EFAULT; - pr_warn_once("%s exported in /proc is scheduled for removal\n", - table->procname); - - *lenp = 2; - *ppos += *lenp; - return 2; -} diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 68d28924ba79..ef858d547e2d 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -11,22 +11,37 @@ #include <linux/balloon_compaction.h> /* + * balloon_page_alloc - allocates a new page for insertion into the balloon + * page list. + * + * Driver must call it to properly allocate a new enlisted balloon page. + * Driver must call balloon_page_enqueue before definitively removing it from + * the guest system. This function returns the page address for the recently + * allocated page or NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_alloc(void) +{ + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_alloc); + +/* * balloon_page_enqueue - allocates a new page and inserts it into the balloon * page list. * @b_dev_info: balloon device descriptor where we will insert a new page to + * @page: new page to enqueue - allocated using balloon_page_alloc. * - * Driver must call it to properly allocate a new enlisted balloon page + * Driver must call it to properly enqueue a new allocated balloon page * before definitively removing it from the guest system. * This function returns the page address for the recently enqueued page or * NULL in the case we fail to allocate a new page this turn. */ -struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page) { unsigned long flags; - struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!page) - return NULL; /* * Block others from accessing the 'page' when we get around to @@ -39,7 +54,6 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) __count_vm_event(BALLOON_INFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); - return page; } EXPORT_SYMBOL_GPL(balloon_page_enqueue); @@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); if (ret && !(gfp_mask & __GFP_NOWARN)) { - pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); } diff --git a/mm/compaction.c b/mm/compaction.c index 85395dc6eb13..10cd757f1006 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -219,6 +219,24 @@ static void reset_cached_positions(struct zone *zone) } /* + * Compound pages of >= pageblock_order should consistenly be skipped until + * released. It is always pointless to compact pages of such order (if they are + * migratable), and the pageblocks they occupy cannot contain any free pages. + */ +static bool pageblock_skip_persistent(struct page *page) +{ + if (!PageCompound(page)) + return false; + + page = compound_head(page); + + if (compound_order(page) >= pageblock_order) + return true; + + return false; +} + +/* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner * meet. @@ -242,6 +260,8 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; + if (pageblock_skip_persistent(page)) + continue; clear_pageblock_skip(page); } @@ -275,7 +295,7 @@ static void update_pageblock_skip(struct compact_control *cc, struct zone *zone = cc->zone; unsigned long pfn; - if (cc->ignore_skip_hint) + if (cc->no_set_skip_hint) return; if (!page) @@ -307,7 +327,12 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct compact_control *cc, +static inline bool pageblock_skip_persistent(struct page *page) +{ + return false; +} + +static inline void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, bool migrate_scanner) { @@ -449,13 +474,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * and the only danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); + const unsigned int order = compound_order(page); - if (likely(comp_order < MAX_ORDER)) { - blockpfn += (1UL << comp_order) - 1; - cursor += (1UL << comp_order) - 1; + if (likely(order < MAX_ORDER)) { + blockpfn += (1UL << order) - 1; + cursor += (1UL << order) - 1; } - goto isolate_fail; } @@ -772,11 +796,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) - low_pfn += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -1928,9 +1951,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, - .ignore_skip_hint = true, + .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, - }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..56e2d9125ea5 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason) */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", + pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) @@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason) #ifdef CONFIG_MEMCG if (page->mem_cgroup) - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); + pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } @@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %p start %p end %p\n" - "next %p prev %p mm %p\n" - "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n" + pr_emerg("vma %px start %px end %px\n" + "next %px prev %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, @@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n" #ifdef CONFIG_MMU - "get_unmapped_area %p\n" + "get_unmapped_area %px\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %p flags %lx core_state %p\n" + "binfmt %px flags %lx core_state %px\n" #ifdef CONFIG_AIO - "ioctx_table %p\n" + "ioctx_table %px\n" #endif #ifdef CONFIG_MEMCG - "owner %p " + "owner %px " #endif - "exe_file %p\n" + "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %p\n" + "mmu_notifier_mm %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" @@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index d04ac1ec0559..1826f191e72c 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -111,7 +111,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) enum fixed_addresses idx; int i, slot; - WARN_ON(system_state != SYSTEM_BOOTING); + WARN_ON(system_state >= SYSTEM_RUNNING); slot = -1; for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { diff --git a/mm/filemap.c b/mm/filemap.c index 594d73fef8b4..ee83baaf855d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" @@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping, *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); mapping->nrpages++; return 0; } @@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping, radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); } + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ + if (shadow) { mapping->nrexceptional += nr; /* @@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping, mapping->nrpages -= nr; } -/* - * Delete a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. - */ -void __delete_from_page_cache(struct page *page, void *shadow) +static void unaccount_page_cache_page(struct address_space *mapping, + struct page *page) { - struct address_space *mapping = page->mapping; - int nr = hpage_nr_pages(page); + int nr; - trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow) } } - page_cache_tree_delete(mapping, page, shadow); - - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ - /* hugetlb pages do not participate in page cache accounting. */ if (PageHuge(page)) return; + nr = hpage_nr_pages(page); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); @@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow) } /* - * At this point page must be either written or cleaned by truncate. - * Dirty page here signals a bug and loss of unwritten data. + * At this point page must be either written or cleaned by + * truncate. Dirty page here signals a bug and loss of + * unwritten data. * - * This fixes dirty accounting after removing the page entirely but - * leaves PageDirty set: it has no effect for truncated page and - * anyway will be cleared before returning page into buddy allocator. + * This fixes dirty accounting after removing the page entirely + * but leaves PageDirty set: it has no effect for truncated + * page and anyway will be cleared before returning page into + * buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } +/* + * Delete a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold the mapping's tree_lock. + */ +void __delete_from_page_cache(struct page *page, void *shadow) +{ + struct address_space *mapping = page->mapping; + + trace_mm_filemap_delete_from_page_cache(page); + + unaccount_page_cache_page(mapping, page); + page_cache_tree_delete(mapping, page, shadow); +} + +static void page_cache_free_page(struct address_space *mapping, + struct page *page) +{ + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + if (freepage) + freepage(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } +} + /** * delete_from_page_cache - delete page from page cache * @page: the page which the kernel is trying to remove from page cache @@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); - - freepage = mapping->a_ops->freepage; - spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - if (freepage) - freepage(page); + page_cache_free_page(mapping, page); +} +EXPORT_SYMBOL(delete_from_page_cache); - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); - } else { - put_page(page); +/* + * page_cache_tree_delete_batch - delete several pages from page cache + * @mapping: the mapping to which pages belong + * @pvec: pagevec with pages to delete + * + * The function walks over mapping->page_tree and removes pages passed in @pvec + * from the radix tree. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (radix tree entries at those indices are not + * modified). The function expects only THP head pages to be present in the + * @pvec and takes care to delete all corresponding tail pages from the radix + * tree as well. + * + * The function expects mapping->tree_lock to be held. + */ +static void +page_cache_tree_delete_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + struct radix_tree_iter iter; + void **slot; + int total_pages = 0; + int i = 0, tail_pages = 0; + struct page *page; + pgoff_t start; + + start = pvec->pages[0]->index; + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (i >= pagevec_count(pvec) && !tail_pages) + break; + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page)) + continue; + if (!tail_pages) { + /* + * Some page got inserted in our range? Skip it. We + * have our pages locked so they are protected from + * being removed. + */ + if (page != pvec->pages[i]) + continue; + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; + page->mapping = NULL; + /* + * Leave page->index set: truncation lookup relies + * upon it + */ + i++; + } else { + tail_pages--; + } + radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); + __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + workingset_lookup_update(mapping)); + total_pages++; } + mapping->nrpages -= total_pages; +} + +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + int i; + unsigned long flags; + + if (!pagevec_count(pvec)) + return; + + spin_lock_irqsave(&mapping->tree_lock, flags); + for (i = 0; i < pagevec_count(pvec); i++) { + trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + + unaccount_page_cache_page(mapping, pvec->pages[i]); + } + page_cache_tree_delete_batch(mapping, pvec); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + for (i = 0; i < pagevec_count(pvec); i++) + page_cache_free_page(mapping, pvec->pages[i]); } -EXPORT_SYMBOL(delete_from_page_cache); int filemap_check_errors(struct address_space *mapping) { @@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping, if (end_byte < start_byte) return; - pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + pagevec_init(&pvec); + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); ClearPageError(page); } @@ -1041,6 +1139,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) wait_queue_head_t *q = page_waitqueue(page); return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); } +EXPORT_SYMBOL(wait_on_page_bit_killable); /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue @@ -1754,9 +1853,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1764,8 +1864,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1778,6 +1879,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1819,18 +1923,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag @@ -2159,7 +2273,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc(mapping); if (!page) { error = -ENOMEM; goto out; @@ -2271,7 +2385,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) int ret; do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); + page = __page_cache_alloc(gfp_mask); if (!page) return -ENOMEM; @@ -2675,7 +2789,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 2f98df0d460e..c64dca6e27c2 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -53,6 +53,20 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, ret = -EFAULT; goto out; } + + /* + * While get_vaddr_frames() could be used for transient (kernel + * controlled lifetime) pinning of memory pages all current + * users establish long term (userspace controlled lifetime) + * page pinning. Treat get_vaddr_frames() like + * get_user_pages_longterm() and disallow it for filesystem-dax + * mappings. + */ + if (vma_is_fsdax(vma)) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; @@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); +#ifdef CONFIG_FS_DAX +/* + * This is the same as get_user_pages() in that it assumes we are + * operating on the current task's mm, but it goes further to validate + * that the vmas associated with the address range are suitable for + * longterm elevated page reference counts. For example, filesystem-dax + * mappings are subject to the lifetime enforced by the filesystem and + * we need guarantees that longterm users like RDMA and V4L2 only + * establish mappings that have a kernel enforced revocation mechanism. + * + * "longterm" == userspace controlled elevated page count lifetime. + * Contrast this to iov_iter_get_pages() usages which are transient. + */ +long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas_arg) +{ + struct vm_area_struct **vmas = vmas_arg; + struct vm_area_struct *vma_prev = NULL; + long rc, i; + + if (!pages) + return -EINVAL; + + if (!vmas) { + vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + return -ENOMEM; + } + + rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + + for (i = 0; i < rc; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma == vma_prev) + continue; + + vma_prev = vma; + + if (vma_is_fsdax(vma)) + break; + } + + /* + * Either get_user_pages() failed, or the vma validation + * succeeded, in either case we don't need to put_page() before + * returning. + */ + if (i >= rc) + goto out; + + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; +out: + if (vmas != vmas_arg) + kfree(vmas); + return rc; +} +EXPORT_SYMBOL(get_user_pages_longterm); +#endif /* CONFIG_FS_DAX */ + /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c new file mode 100644 index 000000000000..5c8e2abeaa15 --- /dev/null +++ b/mm/gup_benchmark.c @@ -0,0 +1,100 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/ktime.h> +#include <linux/debugfs.h> + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) + +struct gup_benchmark { + __u64 delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; +}; + +static int __gup_benchmark_ioctl(unsigned int cmd, + struct gup_benchmark *gup) +{ + ktime_t start_time, end_time; + unsigned long i, nr, nr_pages, addr, next; + struct page **pages; + + nr_pages = gup->size / PAGE_SIZE; + pages = kvmalloc(sizeof(void *) * nr_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = 0; + nr = gup->nr_pages_per_call; + start_time = ktime_get(); + for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) { + if (nr != gup->nr_pages_per_call) + break; + + next = addr + nr * PAGE_SIZE; + if (next > gup->addr + gup->size) { + next = gup->addr + gup->size; + nr = (next - addr) / PAGE_SIZE; + } + + nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + i += nr; + } + end_time = ktime_get(); + + gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->size = addr - gup->addr; + + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + break; + put_page(pages[i]); + } + + kvfree(pages); + return 0; +} + +static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct gup_benchmark gup; + int ret; + + if (cmd != GUP_FAST_BENCHMARK) + return -EINVAL; + + if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) + return -EFAULT; + + ret = __gup_benchmark_ioctl(cmd, &gup); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &gup, sizeof(gup))) + return -EFAULT; + + return 0; +} + +static const struct file_operations gup_benchmark_fops = { + .open = nonseekable_open, + .unlocked_ioctl = gup_benchmark_ioctl, +}; + +static int gup_benchmark_init(void) +{ + void *ret; + + ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); + if (!ret) + pr_warn("Failed to create gup_benchmark in debugfs"); + + return 0; +} + +late_initcall(gup_benchmark_init); @@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t key, align_start, align_size; align_start = resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); - align_end = align_start + align_size - 1; mutex_lock(&hmm_devmem_lock); for (key = resource->start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1981ed697dab..0e7ded98d114 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -39,10 +39,10 @@ #include "internal.h" /* - * By default transparent hugepage support is disabled in order that avoid - * to risk increase the memory footprint of applications without a guaranteed - * benefit. When transparent hugepage support is enabled, is for all mappings, - * and khugepaged scans all mappings. + * By default, transparent hugepage support is disabled in order to avoid + * risking an increased memory footprint for applications that are not + * guaranteed to benefit from it. When transparent hugepage support is + * enabled, it is for all mappings, and khugepaged scans all mappings. * Defrag is invoked by khugepaged hugepage allocations and by page faults * for all hugepage allocations. */ @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -842,20 +842,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) + pmd_t *pmd, int flags) { pmd_t _pmd; - /* - * We should set the dirty bit only for FOLL_WRITE but for now - * the dirty bit in the pmd is meaningless. And if the dirty - * bit will become meaningful and we'll only set it with - * FOLL_WRITE, an atomic set_bit will be required on the pmd to - * set the young bit, instead of the current set_pmd_at. - */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + _pmd = pmd_mkyoung(*pmd); + if (flags & FOLL_WRITE) + _pmd = pmd_mkdirty(_pmd); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) + pmd, _pmd, flags & FOLL_WRITE)) update_mmu_cache_pmd(vma, addr, pmd); } @@ -884,7 +879,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return NULL; if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd); + touch_pmd(vma, addr, pmd, flags); /* * device mapped pages can only be returned if the @@ -942,7 +937,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -978,7 +973,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -995,20 +990,15 @@ out: #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void touch_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud) + pud_t *pud, int flags) { pud_t _pud; - /* - * We should set the dirty bit only for FOLL_WRITE but for now - * the dirty bit in the pud is meaningless. And if the dirty - * bit will become meaningful and we'll only set it with - * FOLL_WRITE, an atomic set_bit will be required on the pud to - * set the young bit, instead of the current set_pud_at. - */ - _pud = pud_mkyoung(pud_mkdirty(*pud)); + _pud = pud_mkyoung(*pud); + if (flags & FOLL_WRITE) + _pud = pud_mkdirty(_pud); if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, - pud, _pud, 1)) + pud, _pud, flags & FOLL_WRITE)) update_mmu_cache_pud(vma, addr, pud); } @@ -1031,7 +1021,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return NULL; if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pud); + touch_pud(vma, addr, pud, flags); /* * device mapped pages can only be returned if the @@ -1189,8 +1179,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. + * + * See Documentation/vm/mmu_notifier.txt + */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); @@ -1216,7 +1213,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, page_remove_rmap(page, true); spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1365,7 +1367,12 @@ alloc: } spin_unlock(vmf->ptl); out_mn: - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); out: return ret; out_unlock: @@ -1407,7 +1414,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd); + touch_pmd(vma, addr, pmd, flags); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid @@ -1678,7 +1685,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -2017,7 +2024,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PUD_SIZE); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2029,8 +2041,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - /* leave pmd empty until pte is filled */ - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. + * + * See Documentation/vm/mmu_notifier.txt + */ + pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2085,6 +2104,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } @@ -2220,7 +2248,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PMD_SIZE); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, @@ -2718,7 +2760,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return ACCESS_ONCE(pgdata->split_queue_len); + return READ_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2d2ff5e8bf2b..9a334f5fb730 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) } } +static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) +{ + if (addr & ~(huge_page_mask(hstate_vma(vma)))) + return -EINVAL; + return 0; +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, + .split = hugetlb_vm_op_split, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -3256,9 +3264,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ huge_ptep_set_wrprotect(src, addr, src_pte); - mmu_notifier_invalidate_range(src, mmun_start, - mmun_end); } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); @@ -4318,7 +4331,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * and that page table be reused and filled with junk. */ flush_hugetlb_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + /* + * No need to call mmu_notifier_invalidate_range() we are downgrading + * page table protection not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); @@ -4617,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - p4d = p4d_offset(pgd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { diff --git a/mm/internal.h b/mm/internal.h index 1df011f62480..e6bd35182dae 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -198,6 +198,7 @@ struct compact_control { const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6f319fb81718..405bba487df5 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size) } void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) + slab_flags_t *flags) { int redzone_adjust; int orig_size = *size; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6bcfb01ba038..410c8235e671 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -134,7 +134,7 @@ static void print_error_description(struct kasan_access_info *info) pr_err("BUG: KASAN: %s in %pS\n", bug_type, (void *)info->ip); - pr_err("%s of size %zu at addr %p by task %s/%d\n", + pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, info->access_addr, current->comm, task_pid_nr(current)); } @@ -206,7 +206,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, const char *rel_type; int rel_bytes; - pr_err("The buggy address belongs to the object at %p\n" + pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", object, cache->name, cache->object_size); @@ -225,7 +225,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, } pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%p, %p)\n", + " %d-byte region [%px, %px)\n", rel_bytes, rel_type, cache->object_size, (void *)object_addr, (void *)(object_addr + cache->object_size)); } @@ -302,7 +302,7 @@ static void print_shadow_for_address(const void *addr) char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), - (i == 0) ? ">%p: " : " %p: ", kaddr); + (i == 0) ? ">%px: " : " %px: ", kaddr); /* * We should not pass a shadow pointer to generic * function, because generic functions may try to diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43cb3043311b..ea4ff259b671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); + mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c deleted file mode 100644 index 800d64b854ea..000000000000 --- a/mm/kmemcheck.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/gfp.h> -#include <linux/mm_types.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include "slab.h" -#include <linux/kmemcheck.h> - -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ - struct page *shadow; - int pages; - int i; - - pages = 1 << order; - - /* - * With kmemcheck enabled, we need to allocate a memory area for the - * shadow bits as well. - */ - shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); - if (!shadow) { - if (printk_ratelimit()) - pr_err("kmemcheck: failed to allocate shadow bitmap\n"); - return; - } - - for(i = 0; i < pages; ++i) - page[i].shadow = page_address(&shadow[i]); - - /* - * Mark it as non-present for the MMU so that our accesses to - * this memory will trigger a page fault and let us analyze - * the memory accesses. - */ - kmemcheck_hide_pages(page, pages); -} - -void kmemcheck_free_shadow(struct page *page, int order) -{ - struct page *shadow; - int pages; - int i; - - if (!kmemcheck_page_is_tracked(page)) - return; - - pages = 1 << order; - - kmemcheck_show_pages(page, pages); - - shadow = virt_to_page(page[0].shadow); - - for(i = 0; i < pages; ++i) - page[i].shadow = NULL; - - __free_pages(shadow, order); -} - -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ - if (unlikely(!object)) /* Skip object if allocation failed */ - return; - - /* - * Has already been memset(), which initializes the shadow for us - * as well. - */ - if (gfpflags & __GFP_ZERO) - return; - - /* No need to initialize the shadow of a non-tracked slab. */ - if (s->flags & SLAB_NOTRACK) - return; - - if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { - /* - * Allow notracked objects to be allocated from - * tracked caches. Note however that these objects - * will still get page faults on access, they just - * won't ever be flagged as uninitialized. If page - * faults are not acceptable, the slab cache itself - * should be marked NOTRACK. - */ - kmemcheck_mark_initialized(object, size); - } else if (!s->ctor) { - /* - * New objects should be marked uninitialized before - * they're returned to the called. - */ - kmemcheck_mark_uninitialized(object, size); - } -} - -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) -{ - /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - kmemcheck_mark_freed(object, size); -} - -void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, - gfp_t gfpflags) -{ - int pages; - - if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) - return; - - pages = 1 << order; - - /* - * NOTE: We choose to track GFP_ZERO pages too; in fact, they - * can become uninitialized by copying uninitialized memory - * into them. - */ - - /* XXX: Can use zone->node for node? */ - kmemcheck_alloc_shadow(page, order, gfpflags, -1); - - if (gfpflags & __GFP_ZERO) - kmemcheck_mark_initialized_pages(page, pages); - else - kmemcheck_mark_uninitialized_pages(page, pages); -} diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7780cd83a495..f656ca27f6c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -110,7 +110,6 @@ #include <linux/atomic.h> #include <linux/kasan.h> -#include <linux/kmemcheck.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -128,7 +127,7 @@ /* GFP bitmask for kmemleak internal allocations */ #define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ - __GFP_NOWARN) + __GFP_NOWARN | __GFP_NOFAIL) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) - return false; - kasan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); kasan_enable_current(); @@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end, if (scan_should_stop()) break; - /* don't scan uninitialized memory */ - if (!kmemcheck_is_obj_initialized((unsigned long)ptr, - BYTES_PER_POINTER)) - continue; - kasan_disable_current(); pointer = *ptr; kasan_enable_current(); @@ -1532,6 +1523,8 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); + if (!(pfn & 63)) + cond_resched(); } } put_online_mems(); @@ -2104,7 +2097,7 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); if (!dentry) pr_warn("Failed to create the debugfs kmemleak file\n"); @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. + * + * No need to notify as we are downgrading page table to read + * only not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt */ - entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); + /* + * No need to notify as we are replacing a read only page with another + * read only page with the same content. + * + * See Documentation/vm/mmu_notifier.txt + */ + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); diff --git a/mm/list_lru.c b/mm/list_lru.c index f141f0c80ff3..fd41e969ede5 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -221,6 +221,7 @@ restart: switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); + /* fall through */ case LRU_REMOVED: isolated++; nlru->nr_items--; diff --git a/mm/madvise.c b/mm/madvise.c index 375cf32087e4..751e97aa2210 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma, { struct file *file = vma->vm_file; + *prev = vma; #ifdef CONFIG_SWAP if (!file) { - *prev = vma; force_swapin_readahead(vma, start, end); return 0; } if (shmem_mapping(file->f_mapping)) { - *prev = vma; force_shm_swapin_readahead(vma, start, end, file->f_mapping); return 0; @@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } - *prev = vma; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (end > vma->vm_end) end = vma->vm_end; diff --git a/mm/memblock.c b/mm/memblock.c index 91205780e6b1..46aacdfa4f4d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -533,7 +533,7 @@ repeat: base = obase; nr_new = 0; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -1327,7 +1327,6 @@ again: return NULL; done: ptr = phys_to_virt(alloc); - memset(ptr, 0, size); /* * The min_count is set to 0 so that bootmem allocated blocks @@ -1341,6 +1340,45 @@ done: } /** + * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memory and without panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory, does not panic if request + * cannot be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); +#ifdef CONFIG_DEBUG_VM + if (ptr && size > 0) + memset(ptr, 0xff, size); +#endif + return ptr; +} + +/** * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size @@ -1351,8 +1389,8 @@ done: * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides - * additional debug information (including caller info), if enabled. + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. * * RETURNS: * Virtual address of allocated memory block on success, NULL on failure. @@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { + void *ptr; + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, (u64)max_addr, (void *)_RET_IP_); - return memblock_virt_alloc_internal(size, align, min_addr, - max_addr, nid); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + memset(ptr, 0, size); + return ptr; } /** @@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_virt_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * @@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid( (u64)max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) + if (ptr) { + memset(ptr, 0, size); return ptr; + } panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, @@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) unsigned long size = 0; int idx; - for_each_memblock_type((&memblock.reserved), rgn) { + for_each_memblock_type(idx, (&memblock.reserved), rgn) { phys_addr_t start, end; if (rgn->base + rgn->size < start_addr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 661f046ad318..ac2ffd5e02b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#ifdef CONFIG_SLABINFO +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_start = memcg_slab_start, @@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) - css_put(&memcg->css); + css_put_many(&memcg->css, nr_entries); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 88366626c0b7..4acdf393a801 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); diff --git a/mm/memory.c b/mm/memory.c index a728bed16c20..793004608332 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache; + struct page *page = NULL, *swapcache = NULL; struct mem_cgroup *memcg; struct vma_swap_readahead swap_ra; swp_entry_t entry; @@ -2852,8 +2857,11 @@ int do_swap_page(struct vm_fault *vmf) int ret = 0; bool vma_readahead = swap_use_vma_readahead(); - if (vma_readahead) + if (vma_readahead) { page = swap_readahead_detect(vmf, &swap_ra); + swapcache = page; + } + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { if (page) put_page(page); @@ -2881,17 +2889,39 @@ int do_swap_page(struct vm_fault *vmf) } goto out; } + + delayacct_set_flag(DELAYACCT_PF_SWAPIN); - if (!page) + if (!page) { page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, vmf->address); + swapcache = page; + } + if (!page) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + struct swap_info_struct *si = swp_swap_info(entry); + + if (si->flags & SWP_SYNCHRONOUS_IO && + __swap_count(si, entry) == 1) { + /* skip swapcache */ + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + set_page_private(page, entry.val); + lru_cache_add_anon(page); + swap_readpage(page, true); + } + } else { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; + } + if (!page) { /* * Back out if somebody else faulted in this pte @@ -2920,7 +2950,6 @@ int do_swap_page(struct vm_fault *vmf) goto out_release; } - swapcache = page; locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2935,7 +2964,8 @@ int do_swap_page(struct vm_fault *vmf) * test below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + if (unlikely((!PageSwapCache(page) || + page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); @@ -2988,14 +3018,16 @@ int do_swap_page(struct vm_fault *vmf) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); vmf->orig_pte = pte; - if (page == swapcache) { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); - activate_page(page); - } else { /* ksm created a completely new copy */ + + /* ksm created a completely new copy */ + if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); + } else { + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + mem_cgroup_commit_charge(page, memcg, true, false); + activate_page(page); } swap_free(entry); @@ -3003,7 +3035,7 @@ int do_swap_page(struct vm_fault *vmf) (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3036,7 +3068,7 @@ out_page: unlock_page(page); out_release: put_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } @@ -3212,7 +3244,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3271,7 +3303,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } @@ -3805,7 +3837,8 @@ static inline int create_huge_pmd(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +/* `inline' is required to avoid gcc 4.1.2 build error */ +static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); @@ -3891,9 +3924,9 @@ static int handle_pte_fault(struct vm_fault *vmf) /* * some architectures can have larger ptes than wordsize, * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee - * atomic accesses. The code below just needs a consistent - * view for the ifs and we later double check anyway with the + * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic + * accesses. The code below just needs a consistent view + * for the ifs and we later double check anyway with the * ptl lock held. So here a barrier will do. */ barrier(); @@ -4124,15 +4157,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; @@ -4457,17 +4492,15 @@ void print_vma_addr(char *prefix, unsigned long ip) struct vm_area_struct *vma; /* - * Do not print if we are in atomic - * contexts (in exception stacks, etc.): + * we might be running from an atomic context so we cannot sleep */ - if (preempt_count()) + if (!down_read_trylock(&mm->mmap_sem)) return; - down_read(&mm->mmap_sem); vma = find_vma(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_KERNEL); + char *buf = (char *)__get_free_page(GFP_NOWAIT); if (buf) { char *p; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d4b5f29906b9..c52aa05b106c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, /* * Make all the pages reserved so that nobody will stumble over half * initialized state. - * FIXME: We also have to associate it with a node because pfn_to_node + * FIXME: We also have to associate it with a node because page_to_nid * relies on having page with the proper node. */ for (i = 0; i < PAGES_PER_SECTION; i++) { @@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg) } static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn, unsigned long timeout) + unsigned long end_pfn) { - unsigned long pfn, nr_pages, expire; + unsigned long pfn, nr_pages; long offlined_pages; - int ret, drain, retry_max, node; + int ret, node; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; @@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal; pfn = start_pfn; - expire = jiffies + timeout; - drain = 0; - retry_max = 5; repeat: /* start memory hot removal */ - ret = -EAGAIN; - if (time_after(jiffies, expire)) - goto failed_removal; ret = -EINTR; if (signal_pending(current)) goto failed_removal; - ret = 0; - if (drain) { - lru_add_drain_all_cpuslocked(); - cond_resched(); - drain_all_pages(zone); - } + + cond_resched(); + lru_add_drain_all_cpuslocked(); + drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); - if (!ret) { - drain = 1; - goto repeat; - } else { - if (ret < 0) - if (--retry_max == 0) - goto failed_removal; - yield(); - drain = 1; - goto repeat; - } + goto repeat; } - /* drain all zone's lru pagevec, this is asynchronous... */ - lru_add_drain_all_cpuslocked(); - yield(); - /* drain pcp pages, this is synchronous. */ - drain_all_pages(zone); + /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. @@ -1677,10 +1655,8 @@ repeat: goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); - if (offlined_pages < 0) { - ret = -EBUSY; - goto failed_removal; - } + if (offlined_pages < 0) + goto repeat; pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ @@ -1728,7 +1704,7 @@ failed_removal: /* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { - return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); + return __offline_pages(start_pfn, start_pfn + nr_pages); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a2af6d58a68f..4ce44d3ff03d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,6 +85,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> +#include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { - const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; @@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, err = -EINVAL; /* - * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * Check if this process has the right to modify the specified process. + * Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; @@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct page *page; page = __alloc_pages(gfp, order, nid); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); diff --git a/mm/mempool.c b/mm/mempool.c index c4a23cdae3f0..7d8c5a0010a2 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; - pool->elements = kmalloc_node(min_nr * sizeof(void *), + pool->elements = kmalloc_array_node(min_nr, sizeof(void *), gfp_mask, node_id); if (!pool->elements) { kfree(pool); diff --git a/mm/migrate.c b/mm/migrate.c index 1236449b4777..4d0be47a322a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ if (notified) - mmu_notifier_invalidate_range_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(mm, mmu_start, + migrate->end); } /* diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..30472d438794 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct pagevec pvec_putback; int pgrescued = 0; - pagevec_init(&pvec_putback, 0); + pagevec_init(&pvec_putback); /* Phase 1: page isolation */ spin_lock_irq(zone_lru_lock(zone)); @@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, struct pagevec pvec; struct zone *zone; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla if (!can_do_mlock()) return -EPERM; - lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; @@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!can_do_mlock()) return -EPERM; - if (flags & MCL_CURRENT) - lru_add_drain_all(); /* flush pagevec */ - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; diff --git a/mm/mmap.c b/mm/mmap.c index 680506faceae..9efdc021ad22 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned long flags_mask; + + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Force use of MAP_SHARED_VALIDATE with non-legacy + * flags. E.g. MAP_SYNC is dangerous to use with + * MAP_SHARED as you don't know which consistency model + * you will get. We silently ignore unsupported flags + * with MAP_SHARED to preserve backward compatibility. + */ + flags &= LEGACY_MAP_MASK; + /* fall through */ + case MAP_SHARED_VALIDATE: + if (flags & ~flags_mask) + return -EOPNOTSUPP; if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) return -EACCES; @@ -2540,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (is_vm_hugetlb_page(vma) && (addr & - ~(huge_page_mask(hstate_vma(vma))))) - return -EINVAL; + if (vma->vm_ops && vma->vm_ops->split) { + err = vma->vm_ops->split(vma, addr); + if (err) + return err; + } new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) @@ -3002,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - set_bit(MMF_OOM_SKIP, &mm->flags); - if (unlikely(tsk_is_oom_victim(current))) { + if (unlikely(mm_is_oom_victim(mm))) { /* * Wait for oom_reap_task() to stop working on this * mm. Because MMF_OOM_SKIP is already set before * calling down_read(), oom_reap_task() will not run * on this "mm" post up_write(). * - * tsk_is_oom_victim() cannot be set from under us - * either because current->mm is already set to NULL + * mm_is_oom_victim() cannot be set from under us + * either because victim->mm is already set to NULL * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if current->mm + * set not NULL by the OOM killer only if victim->mm * is found not NULL while holding the task_lock. */ + set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..96edb33fd09a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, + unsigned long end, + bool only_end) { struct mmu_notifier *mn; int id; @@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * subsystem registers either invalidate_range_start()/end() or * invalidate_range(), so this will be no additional overhead * (besides the pointer check). + * + * We skip call to invalidate_range() if we know it is safe ie + * call site use mmu_notifier_invalidate_range_only_end() which + * is safe to do when we know that a call to invalidate_range() + * already happen under page table lock. */ - if (mn->ops->invalidate_range) + if (!only_end && mn->ops->invalidate_range) mn->ops->invalidate_range(mn, mm, start, end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); diff --git a/mm/mprotect.c b/mm/mprotect.c index ec39f730a0bf..58b629bb70de 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -166,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, next = pmd_addr_end(addr, end); if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) - continue; + goto next; /* invoke the mmu notifier if the pmd is populated */ if (!mni_start) { @@ -188,7 +188,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } /* huge pmd was handled */ - continue; + goto next; } } /* fall through, the trans huge pmd just split */ @@ -196,6 +196,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; +next: + cond_resched(); } while (pmd++, addr = next, addr != end); if (mni_start) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dee0f75c3013..29f855551efe 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <asm/tlb.h> #include "internal.h" +#include "slab.h" #define CREATE_TRACE_POINTS #include <trace/events/oom.h> @@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p, return false; } +/* + * Print out unreclaimble slabs info when unreclaimable slabs amount is greater + * than all user memory (LRU pages) + */ +static bool is_dump_unreclaim_slabs(void) +{ + unsigned long nr_lru; + + nr_lru = global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state(NR_ISOLATED_ANON) + + global_node_page_state(NR_ISOLATED_FILE) + + global_node_page_state(NR_UNEVICTABLE); + + return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru); +} + /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* @@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), - mm_nr_pmds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", - current->comm, oc->gfp_mask, &oc->gfp_mask); - if (oc->nodemask) - pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); - else - pr_cont("(null)"); - pr_cont(", order=%d, oom_score_adj=%hd\n", - oc->order, current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(oc->nodemask), oc->order, + current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); cpuset_print_current_mems_allowed(); dump_stack(); - if (oc->memcg) + if (is_memcg_oom(oc)) mem_cgroup_print_oom_info(oc->memcg, p); - else + else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + if (is_dump_unreclaim_slabs()) + dump_unreclaimable_slab(); + } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } @@ -532,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (!can_madv_dontneed_vma(vma)) continue; @@ -547,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * we do not want to block exit_mmap by keeping mm ref * count elevated without a good reason. */ - if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, NULL); + tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + } } - tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), @@ -618,9 +637,6 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th) - return; - /* tsk is already queued? */ if (tsk == oom_reaper_list || tsk->oom_reaper_list) return; @@ -638,11 +654,6 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); - if (IS_ERR(oom_reaper_th)) { - pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", - PTR_ERR(oom_reaper_th)); - oom_reaper_th = NULL; - } return 0; } subsys_initcall(oom_init) @@ -672,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { mmgrab(tsk->signal->oom_mm); + set_bit(MMF_OOM_VICTIM, &mm->flags); + } /* * Make sure that the task is woken up from uninterruptible sleep diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b9c5cbe8eba..586f31261c83 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -625,9 +625,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ -static void writeout_period(unsigned long t) +static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = (void *)t; + struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -650,8 +650,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - setup_deferrable_timer(&dom->period_timer, writeout_period, - (unsigned long)dom); + timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; @@ -1543,7 +1542,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * actually dirty; with m+n sitting in the percpu * deltas. */ - if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { @@ -1559,8 +1558,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping, - struct bdi_writeback *wb, +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; @@ -1802,7 +1800,7 @@ pause: * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error(wb)) + if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) @@ -1910,7 +1908,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, wb, current->nr_dirtied); + balance_dirty_pages(wb, current->nr_dirtied); wb_put(wb); } @@ -1972,31 +1970,32 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); - return 0; -} + unsigned int old_interval = dirty_writeback_interval; + int ret; -#ifdef CONFIG_BLOCK -void laptop_mode_timer_fn(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); - struct bdi_writeback *wb; + ret = proc_dointvec(table, write, buffer, length, ppos); /* - * We want to write everything out, not just down to the dirty - * threshold + * Writing 0 to dirty_writeback_interval will disable periodic writeback + * and a different non-zero value will wakeup the writeback threads. + * wb_wakeup_delayed() would be more appropriate, but it's a pain to + * iterate over all bdis and wbs. + * The reason we do this is to make the change take effect immediately. */ - if (!bdi_has_dirty_io(q->backing_dev_info)) - return; + if (!ret && write && dirty_writeback_interval && + dirty_writeback_interval != old_interval) + wakeup_flusher_threads(WB_REASON_PERIODIC); - rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) - if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, true, - WB_REASON_LAPTOP_TIMER); - rcu_read_unlock(); + return ret; +} + +#ifdef CONFIG_BLOCK +void laptop_mode_timer_fn(struct timer_list *t) +{ + struct backing_dev_info *backing_dev_info = + from_timer(backing_dev_info, t, laptop_mode_wb_timer); + + wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* @@ -2167,7 +2166,7 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -2194,30 +2193,14 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); @@ -2623,7 +2606,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void cancel_dirty_page(struct page *page) +void __cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2644,7 +2627,7 @@ void cancel_dirty_page(struct page *page) ClearPageDirty(page); } } -EXPORT_SYMBOL(cancel_dirty_page); +EXPORT_SYMBOL(__cancel_dirty_page); /* * Clear a page's dirty flag, while caring for dirty memory accounting. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8dfd13f724d9..76c9688b6a0a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@ #include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> -#include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/module.h> #include <linux/suspend.h> @@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); /* * Check tail pages before head page information is cleared to @@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone, static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(struct page *page, - unsigned long pfn, int nr_pages) +static void __init deferred_free_range(unsigned long pfn, + unsigned long nr_pages) { - int i; + struct page *page; + unsigned long i; - if (!page) + if (!nr_pages) return; + page = pfn_to_page(pfn); + /* Free a large naturally-aligned chunk if possible */ if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { @@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void) complete(&pgdat_init_all_done_comp); } +/* + * Helper for deferred_init_range, free the given range, reset the counters, and + * return number of pages freed. + */ +static inline unsigned long __init __def_free(unsigned long *nr_free, + unsigned long *free_base_pfn, + struct page **page) +{ + unsigned long nr = *nr_free; + + deferred_free_range(*free_base_pfn, nr); + *free_base_pfn = 0; + *nr_free = 0; + *page = NULL; + + return nr; +} + +static unsigned long __init deferred_init_range(int nid, int zid, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long free_base_pfn = 0; + unsigned long nr_pages = 0; + unsigned long nr_free = 0; + struct page *page = NULL; + unsigned long pfn; + + /* + * First we check if pfn is valid on architectures where it is possible + * to have holes within pageblock_nr_pages. On systems where it is not + * possible, this function is optimized out. + * + * Then, we check if a current large page is valid by only checking the + * validity of the head pfn. + * + * meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not + * belong to this memory node. + * + * Finally, we minimize pfn page lookups and scheduler checks by + * performing it only once every pageblock_nr_pages. + * + * We do it in two loops: first we initialize struct page, than free to + * buddy allocator, becuse while we are freeing pages we can access + * pages that are ahead (computing buddy page in __free_one_page()). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + if ((pfn & nr_pgmask) || pfn_valid(pfn)) { + if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + if (page && (pfn & nr_pgmask)) + page++; + else + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zid, nid); + cond_resched(); + } + } + } + + page = NULL; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (page && (pfn & nr_pgmask)) { + page++; + nr_free++; + } else { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + page = pfn_to_page(pfn); + free_base_pfn = pfn; + nr_free = 1; + cond_resched(); + } + } + /* Free the last block of pages to allocator */ + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; int nid = pgdat->node_id; - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long walk_start, walk_end; - int i, zid; + unsigned long spfn, epfn; + phys_addr_t spa, epa; + int zid; struct zone *zone; unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + u64 i; if (first_init_pfn == ULONG_MAX) { pgdat_init_report_one_done(); @@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } + first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); - for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { - unsigned long pfn, end_pfn; - struct page *page = NULL; - struct page *free_base_page = NULL; - unsigned long free_base_pfn = 0; - int nr_to_free = 0; - - end_pfn = min(walk_end, zone_end_pfn(zone)); - pfn = first_init_pfn; - if (pfn < walk_start) - pfn = walk_start; - if (pfn < zone->zone_start_pfn) - pfn = zone->zone_start_pfn; - - for (; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - goto free_range; - - /* - * Ensure pfn_valid is checked every - * pageblock_nr_pages for memory holes - */ - if ((pfn & (pageblock_nr_pages - 1)) == 0) { - if (!pfn_valid(pfn)) { - page = NULL; - goto free_range; - } - } - - if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - page = NULL; - goto free_range; - } - - /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { - page++; - } else { - nr_pages += nr_to_free; - deferred_free_range(free_base_page, - free_base_pfn, nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - - page = pfn_to_page(pfn); - cond_resched(); - } - - if (page->flags) { - VM_BUG_ON(page_zone(page) != zone); - goto free_range; - } - - __init_single_page(page, pfn, zid, nid); - if (!free_base_page) { - free_base_page = page; - free_base_pfn = pfn; - nr_to_free = 0; - } - nr_to_free++; - - /* Where possible, batch up pages for a single free */ - continue; -free_range: - /* Free the current block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, - nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - } - /* Free the last block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, nr_to_free); - - first_init_pfn = max(end_pfn, first_init_pfn); + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_range(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ @@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static inline +static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { @@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { }; #ifdef CONFIG_CMA -static struct page *__rmqueue_cma_fallback(struct zone *zone, +static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { return __rmqueue_smallest(zone, order, MIGRATE_CMA); @@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. */ -static inline bool +static __always_inline bool __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; @@ -2289,8 +2321,8 @@ do_steal: * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) +static __always_inline struct page * +__rmqueue(struct zone *zone, unsigned int order, int migratetype) { struct page *page; @@ -2315,7 +2347,7 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, bool cold) + int migratetype) { int i, alloced = 0; @@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, continue; /* - * Split buddy pages returned by expand() are received here - * in physical page order. The page is added to the callers and - * list and the list head then moves forward. From the callers - * perspective, the linked list is ordered by page number in - * some conditions. This is useful for IO devices that can - * merge IO requests if the physical pages are ordered - * properly. + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of + * caller's list. From the callers perspective, the linked list + * is ordered by page number under some conditions. This is + * useful for IO devices that can forward direction from the + * head, thus also in the physical page order. This is useful + * for IO devices that can merge IO requests if the physical + * pages are ordered properly. */ - if (likely(!cold)) - list_add(&page->lru, list); - else - list_add_tail(&page->lru, list); - list = &page->lru; + list_add_tail(&page->lru, list); alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone) if (WARN_ON_ONCE(!mm_percpu_wq)) return; - /* Workqueues cannot recurse */ - if (current->flags & PF_WQ_WORKER) - return; - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need @@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -/* - * Free a 0-order page - * cold == true ? free a cold page : free a hot page - */ -void free_hot_cold_page(struct page *page, bool cold) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pcp_prepare(page)) - return; + return false; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); + return true; +} + +static void free_unref_page_commit(struct page *page, unsigned long pfn) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + int migratetype; + + migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); /* @@ -2620,38 +2646,73 @@ void free_hot_cold_page(struct page *page, bool cold) if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, pfn, 0, migratetype); - goto out; + return; } migratetype = MIGRATE_MOVABLE; } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (!cold) - list_add(&page->lru, &pcp->lists[migratetype]); - else - list_add_tail(&page->lru, &pcp->lists[migratetype]); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } +} -out: +/* + * Free a 0-order page + */ +void free_unref_page(struct page *page) +{ + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + + if (!free_unref_page_prepare(page, pfn)) + return; + + local_irq_save(flags); + free_unref_page_commit(page, pfn); local_irq_restore(flags); } /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list) { struct page *page, *next; + unsigned long flags, pfn; + int batch_count = 0; + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - trace_mm_page_free_batched(page, cold); - free_hot_cold_page(page, cold); + pfn = page_to_pfn(page); + if (!free_unref_page_prepare(page, pfn)) + list_del(&page->lru); + set_page_private(page, pfn); } + + local_irq_save(flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); + + set_page_private(page, 0); + trace_mm_page_free_batched(page); + free_unref_page_commit(page, pfn); + + /* + * Guard against excessive IRQ disabled times when we get + * a large list of pages to free. + */ + if (++batch_count == SWAP_CLUSTER_MAX) { + local_irq_restore(flags); + batch_count = 0; + local_irq_save(flags); + } + } + local_irq_restore(flags); } /* @@ -2669,15 +2730,6 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK - /* - * Split shadow pages too, because free(page[0]) would - * otherwise free the whole shadow. - */ - if (kmemcheck_page_is_tracked(page)) - split_page(virt_to_page(page[0].shadow), order); -#endif - for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); @@ -2743,6 +2795,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; @@ -2758,7 +2814,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - bool cold, struct per_cpu_pages *pcp, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; @@ -2767,16 +2823,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype, cold); + migratetype); if (unlikely(list_empty(list))) return NULL; } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; } while (check_new_pcp(page)); @@ -2791,14 +2843,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, { struct per_cpu_pages *pcp; struct list_head *list; - bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -3006,9 +3057,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -3020,6 +3068,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } @@ -3235,20 +3286,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; - pr_warn("%s: ", current->comm); - va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_cont("%pV", &vaf); + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); - if (nodemask) - pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); - else - pr_cont("(null)\n"); - cpuset_print_current_mems_allowed(); dump_stack(); @@ -3868,8 +3913,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries; int no_progress_loops; - unsigned long alloc_start = jiffies; - unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; int reserve_flags; @@ -4001,14 +4044,6 @@ retry: if (!can_direct_reclaim) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; @@ -4223,9 +4258,6 @@ out: page = NULL; } - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; @@ -4262,7 +4294,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4320,7 +4352,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) unsigned int order = compound_order(page); if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -5646,16 +5678,6 @@ void __init sparse_memory_present_with_active_regions(int nid) unsigned long start_pfn, end_pfn; int i, this_nid; -#ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) { - unsigned long size, align; - - size = sizeof(struct mem_section) * NR_SECTION_ROOTS; - align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); - } -#endif - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) memory_present(this_nid, start_pfn, end_pfn); } @@ -6136,6 +6158,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } +#ifdef CONFIG_FLAT_NODE_MEM_MAP static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -6145,7 +6168,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) if (!pgdat->node_spanned_pages) return; -#ifdef CONFIG_FLAT_NODE_MEM_MAP start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ @@ -6167,6 +6189,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pgdat->node_id); pgdat->node_mem_map = map + offset; } + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's @@ -6179,8 +6204,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } +#else +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6207,16 +6234,51 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", - nid, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#endif reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ + phys_addr_t start, end; + unsigned long pfn; + u64 i, pgcnt; + + /* + * Loop through ranges that are reserved, but do not have reported + * physical memory backing. + */ + pgcnt = 0; + for_each_resv_unavail_range(i, &start, &end) { + for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + continue; + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + } + + /* + * Struct pages that do not have backing memory. This could be because + * firmware is using some of this memory, or for some other reasons. + * Once memblock is changed so such behaviour is not allowed: i.e. + * list of "reserved" memory must be a subset of list of "memory", then + * this code can be removed. + */ + if (pgcnt) + pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 @@ -6640,6 +6702,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6803,6 +6866,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) @@ -7315,18 +7379,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); - /* - * memblock allocator returns zeroed memory already, so HASH_ZERO is - * currently not used when HASH_EARLY is specified. - */ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; - if (flags & HASH_EARLY) - table = memblock_virt_alloc_nopanic(size, 0); - else if (hashdist) + if (flags & HASH_EARLY) { + if (flags & HASH_ZERO) + table = memblock_virt_alloc_nopanic(size, 0); + else + table = memblock_virt_alloc_raw(size, 0); + } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); - else { + } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which @@ -7363,10 +7426,10 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + int migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; - int mt; /* * For avoiding noise data, lru_add_drain_all() should be called @@ -7374,8 +7437,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, */ if (zone_idx(zone) == ZONE_MOVABLE) return false; - mt = get_pageblock_migratetype(page); - if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + + /* + * CMA allocations (alloc_contig_range) really need to mark isolate + * CMA pageblocks even when they are not movable in fact so consider + * them movable here. + */ + if (is_migrate_cma(migratetype) && + is_migrate_cma(get_pageblock_migratetype(page))) return false; pfn = page_to_pfn(page); @@ -7387,6 +7456,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, page = pfn_to_page(check); + if (PageReserved(page)) + return true; + /* * Hugepages are not in LRU lists, but they're movable. * We need not scan over tail pages bacause we don't @@ -7460,7 +7532,7 @@ bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) @@ -7556,6 +7628,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); @@ -7592,11 +7665,18 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* * In case of -EBUSY, we'd like to know which page causes problem. - * So, just fall through. We will check it in test_pages_isolated(). + * So, just fall through. test_pages_isolated() has a tracepoint + * which will report the busy page. + * + * It is possible that busy pages could become available before + * the call to test_pages_isolated, and the range will actually be + * allocated. So, if we fall through be sure to clear ret so that + * -EBUSY is not accidentally used or returned to caller. */ ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; + ret =0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES diff --git a/mm/page_ext.c b/mm/page_ext.c index 4f0367d472c4..2c16216c29b6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (unlikely(!base)) return NULL; -#endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); @@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif return get_entry(section->page_ext, pfn); } diff --git a/mm/page_io.c b/mm/page_io.c index 5d882de3fbfd..e93f1a4cacd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -347,7 +347,7 @@ out: return ret; } -int swap_readpage(struct page *page, bool do_poll) +int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; int ret = 0; @@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll) blk_qc_t qc; struct gendisk *disk; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -403,12 +403,12 @@ int swap_readpage(struct page *page, bool do_poll) count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); - while (do_poll) { + while (synchronous) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 44f213935bf6..165ed8117bd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,7 +15,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/page_isolation.h> -static int set_migratetype_isolate(struct page *page, +static int set_migratetype_isolate(struct page *page, int migratetype, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page, * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, skip_hwpoisoned_pages)) ret = 0; @@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page, out: if (!ret) { unsigned long nr_pages; - int migratetype = get_pageblock_migratetype(page); + int mt = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, NULL); - __mod_zone_freepage_state(zone, -nr_pages, migratetype); + __mod_zone_freepage_state(zone, -nr_pages, mt); } spin_unlock_irqrestore(&zone->lock, flags); @@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && - set_migratetype_isolate(page, skip_hwpoisoned_pages)) { + set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f44b95b9d1e..270a8219ccd0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -20,9 +20,9 @@ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { - unsigned int order; + unsigned short order; + short last_migrate_reason; gfp_t gfp_mask; - int last_migrate_reason; depot_stack_handle_t handle; }; @@ -616,7 +616,6 @@ static void init_early_allocated_pages(void) { pg_data_t *pgdat; - drain_all_pages(NULL); for_each_online_pgdat(pgdat) init_zones_in_node(pgdat); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index d22b84310f6d..ae3c2a35d61b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -30,10 +30,37 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } +static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn) +{ + unsigned long hpage_pfn = page_to_pfn(hpage); + + /* THP can be referenced by any subpage */ + return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage); +} + +/** + * check_pte - check if @pvmw->page is mapped at the @pvmw->pte + * + * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* + * mapped. check_pte() has to validate this. + * + * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary + * page. + * + * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration + * entry that points to @pvmw->page or any subpage in case of THP. + * + * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to + * @pvmw->page or any subpage in case of THP. + * + * Otherwise, return false. + * + */ static bool check_pte(struct page_vma_mapped_walk *pvmw) { + unsigned long pfn; + if (pvmw->flags & PVMW_MIGRATION) { -#ifdef CONFIG_MIGRATION swp_entry_t entry; if (!is_swap_pte(*pvmw->pte)) return false; @@ -41,38 +68,25 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_migration_entry(entry)) return false; - if (migration_entry_to_page(entry) - pvmw->page >= - hpage_nr_pages(pvmw->page)) { - return false; - } - if (migration_entry_to_page(entry) < pvmw->page) - return false; -#else - WARN_ON_ONCE(1); -#endif - } else { - if (is_swap_pte(*pvmw->pte)) { - swp_entry_t entry; - entry = pte_to_swp_entry(*pvmw->pte); - if (is_device_private_entry(entry) && - device_private_entry_to_page(entry) == pvmw->page) - return true; - } + pfn = migration_entry_to_pfn(entry); + } else if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; - if (!pte_present(*pvmw->pte)) + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry)) return false; - /* THP can be referenced by any subpage */ - if (pte_page(*pvmw->pte) - pvmw->page >= - hpage_nr_pages(pvmw->page)) { - return false; - } - if (pte_page(*pvmw->pte) < pvmw->page) + pfn = device_private_entry_to_pfn(entry); + } else { + if (!pte_present(*pvmw->pte)) return false; + + pfn = pte_pfn(*pvmw->pte); } - return true; + return pfn_in_hpage(pvmw->page, pfn); } /** diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8bd4afa83cb8..23a3e415ac2c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -188,8 +188,12 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, do { next = hugetlb_entry_end(h, addr, end); pte = huge_pte_offset(walk->mm, addr & hmask, sz); - if (pte && walk->hugetlb_entry) + + if (pte) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + else if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) break; } while (addr = next, addr != end); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 15dab691ea70..9158e5a81391 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; diff --git a/mm/percpu.c b/mm/percpu.c index a0e0c82c1e4c..50e7fdf84055 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1856,7 +1856,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2719,6 +2719,11 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); +#ifdef CONFIG_CRIS +#warning "the CRIS architecture has physical and virtual addresses confused" +#else + pcpu_free_alloc_info(ai); +#endif } #endif /* CONFIG_SMP */ diff --git a/mm/rmap.c b/mm/rmap.c index b874c4761e84..47db27f8049e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart, cend; + unsigned long cstart; int ret = 0; cstart = address = pvmw.address; @@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); - cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); cstart &= PMD_MASK; - cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, #endif } - if (ret) { - mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ + if (ret) (*cleaned)++; - } } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); @@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound) * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, + * before us: so leave the reset to free_unref_page, * and remember that it's only reliable while mapped. * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. @@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ goto discard; } @@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * will take care of the rest. */ dec_mm_counter(mm, mm_counter(page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; @@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, WARN_ON_ONCE(1); ret = false; /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); page_vma_mapped_walk_done(&pvmw); break; } @@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { if (!PageDirty(page)) { + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, + address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); - } else + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + /* + * We should not need to notify here as we reach this + * case only from freeze_page() itself only call from + * split_huge_page_to_list() so everything below must + * be true: + * - page is not anonymous + * - page is locked + * + * So as it is a locked file back page thus it can not + * be remove from the page cache and replace by a new + * page before mmu_notifier_invalidate_range_end so no + * concurrent thread might update its page table to + * point at new page while a device still is using this + * page. + * + * See Documentation/vm/mmu_notifier.txt + */ dec_mm_counter(mm, mm_counter_file(page)); + } discard: + /* + * No need to call mmu_notifier_invalidate_range() it has be + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/vm/mmu_notifier.txt + */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..7fbe67be86fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, if (item != expected) return -ENOENT; __radix_tree_replace(&mapping->page_tree, node, pslot, - replacement, NULL, NULL); + replacement, NULL); return 0; } @@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping) pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ @@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end) { pvec.nr = find_get_entries(mapping, index, @@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, bool done = false; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); pvec.nr = 1; /* start small: we may be there already */ while (!done) { pvec.nr = find_get_entries(mapping, index, @@ -3202,7 +3202,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - struct shmem_inode_info *info; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3222,7 +3221,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = 0; } - info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); @@ -3778,7 +3776,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. */ - if (!(sb->s_flags & MS_KERNMOUNT)) { + if (!(sb->s_flags & SB_KERNMOUNT)) { sbinfo->max_blocks = shmem_default_max_blocks(); sbinfo->max_inodes = shmem_default_max_inodes(); if (shmem_parse_options(data, sbinfo, false)) { @@ -3786,12 +3784,12 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; } } else { - sb->s_flags |= MS_NOUSER; + sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; #else - sb->s_flags |= MS_NOUSER; + sb->s_flags |= SB_NOUSER; #endif spin_lock_init(&sbinfo->stat_lock); @@ -3811,7 +3809,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif uuid_gen(&sb->s_uuid); @@ -3862,12 +3860,11 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static int shmem_init_inodecache(void) +static void shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); - return 0; } static void shmem_destroy_inodecache(void) @@ -3991,9 +3988,7 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = shmem_init_inodecache(); - if (error) - goto out3; + shmem_init_inodecache(); error = register_filesystem(&shmem_fs_type); if (error) { @@ -4020,7 +4015,6 @@ out1: unregister_filesystem(&shmem_fs_type); out2: shmem_destroy_inodecache(); -out3: shm_mnt = ERR_PTR(error); return error; } @@ -4102,6 +4096,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; + /* fall through */ case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); @@ -4183,7 +4178,7 @@ static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; -static struct file *__shmem_file_setup(const char *name, loff_t size, +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { struct file *res; @@ -4192,8 +4187,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, struct super_block *sb; struct qstr this; - if (IS_ERR(shm_mnt)) - return ERR_CAST(shm_mnt); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); @@ -4205,8 +4200,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ - sb = shm_mnt->mnt_sb; - path.mnt = mntget(shm_mnt); + sb = mnt->mnt_sb; + path.mnt = mntget(mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; @@ -4251,7 +4246,7 @@ put_path: */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) { - return __shmem_file_setup(name, size, flags, S_PRIVATE); + return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } /** @@ -4262,11 +4257,25 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon */ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { - return __shmem_file_setup(name, size, flags, 0); + return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** + * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs + * @mnt: the tmpfs mount where the file will be created + * @name: name for dentry (to be seen in /proc/<pid>/maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags) +{ + return __shmem_file_setup(mnt, name, size, flags, 0); +} +EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); + +/** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap_pgoff */ @@ -4281,7 +4290,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE); + file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/slab.c b/mm/slab.c index b7095884fd93..4e51ef954026 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -114,7 +114,6 @@ #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> -#include <linux/kmemcheck.h> #include <linux/memory.h> #include <linux/prefetch.h> #include <linux/sched/task_stack.h> @@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) -#define CFLGS_OFF_SLAB (0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) * Calculate the number of objects and left-over bytes for a given buffer size. */ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over) + slab_flags_t flags, size_t *left_over) { unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; @@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nr_pages; flags |= cachep->allocflags; - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) { slab_out_of_memory(cachep, flags, nodeid); return NULL; @@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { - kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); - - if (cachep->ctor) - kmemcheck_mark_uninitialized_pages(page, nr_pages); - else - kmemcheck_mark_unallocated_pages(page, nr_pages); - } - return page; } @@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) int order = cachep->gfporder; unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, order); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else @@ -1598,11 +1584,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) *dbg_redzone2(cachep, objp)); } - if (cachep->flags & SLAB_STORE_USER) { - pr_err("Last user: [<%p>](%pSR)\n", - *dbg_userword(cachep, objp), - *dbg_userword(cachep, objp)); - } + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { @@ -1635,7 +1618,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", print_tainted(), cachep->name, realobj, size); print_objinfo(cachep, objp, 0); @@ -1664,13 +1647,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Prev obj: start=%p, len=%d\n", realobj, size); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - pr_err("Next obj: start=%p, len=%d\n", realobj, size); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1761,7 +1744,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left_over = 0; int gfporder; @@ -1888,8 +1871,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1897,7 +1880,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1915,7 +1898,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, } static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1938,7 +1921,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, } static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1972,7 +1955,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, } static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -2008,8 +1991,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -int -__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { size_t ralign = BYTES_PER_WORD; gfp_t gfp; @@ -2144,6 +2126,8 @@ done: cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_RECLAIM_ACCOUNT) + cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2621,7 +2605,7 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - pr_err("slab: double free detected in cache '%s', objp %p\n", + pr_err("slab: double free detected in cache '%s', objp %px\n", cachep->name, objp); BUG(); } @@ -2785,7 +2769,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", obj, redzone1, redzone2); } @@ -3091,7 +3075,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { slab_error(cachep, "double free, or memory outside object was overwritten"); - pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -3104,7 +3088,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -3516,8 +3500,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, cachep->object_size); - /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which @@ -4097,7 +4079,6 @@ out: schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } -#ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { unsigned long active_objs, num_objs, active_slabs; @@ -4299,7 +4280,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) return; } #endif - seq_printf(m, "%p", (void *)address); + seq_printf(m, "%px", (void *)address); } static int leaks_show(struct seq_file *m, void *p) @@ -4405,7 +4386,6 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif #ifdef CONFIG_HARDENED_USERCOPY /* diff --git a/mm/slab.h b/mm/slab.h index 86d7c7d860f9..ad657ffa44e5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,7 +21,7 @@ struct kmem_cache { unsigned int object_size;/* The original size of the object */ unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ + slab_flags_t flags; /* Active flags on the slab */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -40,7 +40,6 @@ struct kmem_cache { #include <linux/memcontrol.h> #include <linux/fault-inject.h> -#include <linux/kmemcheck.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> @@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size); #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(unsigned long); +void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); @@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ -extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - unsigned long flags); + slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, unsigned long flags); + size_t size, slab_flags_t flags); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)); + slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)); + slab_flags_t flags, void (*ctor)(void *)); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +static inline slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_NOTRACK | \ SLAB_ACCOUNT) int __kmem_cache_shutdown(struct kmem_cache *); @@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, for (i = 0; i < size; i++) { void *object = p[i]; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); kasan_slab_alloc(s, object, flags); @@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM diff --git a/mm/slab_common.c b/mm/slab_common.c index 0d7fe71ff5e4..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s) } struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) + slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size) { /* @@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags, static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *), + slab_flags_t flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -431,7 +431,7 @@ out_free_cache: */ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; const char *cache_name; @@ -879,7 +879,7 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { int err; @@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz } struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, unsigned long flags) +static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, kmalloc_info[idx].size, flags); @@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) * may already have been created because they were needed to * enable allocations for slab creation. */ -void __init create_kmalloc_caches(unsigned long flags) +void __init create_kmalloc_caches(slab_flags_t flags) { int i; @@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -#ifdef CONFIG_SLABINFO - +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) #ifdef CONFIG_SLAB #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) #else @@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void dump_unreclaimable_slab(void) +{ + struct kmem_cache *s, *s2; + struct slabinfo sinfo; + + /* + * Here acquiring slab_mutex is risky since we don't prefer to get + * sleep in oom path. But, without mutex hold, it may introduce a + * risk of crash. + * Use mutex_trylock to protect the list traverse, dump nothing + * without acquiring the mutex. + */ + if (!mutex_trylock(&slab_mutex)) { + pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + return; + } + + pr_info("Unreclaimable slab info:\n"); + pr_info("Name Used Total\n"); + + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT)) + continue; + + get_slabinfo(s, &sinfo); + + if (sinfo.num_objs > 0) + pr_info("%-17s %10luKB %10luKB\n", cache_name(s), + (sinfo.active_objs * s->size) / 1024, + (sinfo.num_objs * s->size) / 1024); + } + mutex_unlock(&slab_mutex); +} + +#if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ static __always_inline void *__do_krealloc(const void *p, size_t new_size, gfp_t flags) diff --git a/mm/slob.c b/mm/slob.c index 10249160b693..623e8a5c46ce 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } - if (unlikely((gfp & __GFP_ZERO) && b)) + if (unlikely(gfp & __GFP_ZERO)) memset(b, 0, size); return b; } @@ -524,7 +524,7 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ diff --git a/mm/slub.c b/mm/slub.c index 1efbb8123037..cfd56e5a35fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,7 +22,6 @@ #include <linux/notifier.h> #include <linux/seq_file.h> #include <linux/kasan.h> -#include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> @@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ +/* Poison object */ +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) +/* Use cmpxchg_double */ +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) /* * Tracking user of a slab. @@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static int slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static int slub_debug; +static slab_flags_t slub_debug; #endif static char *slub_debug_slabs; @@ -1289,8 +1290,8 @@ out: __setup("slub_debug", setup_slub_debug); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { /* @@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) +#ifdef CONFIG_LOCKDEP { unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } @@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. */ -#if defined(CONFIG_KMEMCHECK) || \ - defined(CONFIG_LOCKDEP) || \ +#if defined(CONFIG_LOCKDEP) || \ defined(CONFIG_DEBUG_KMEMLEAK) || \ defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) @@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else @@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && - !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - int pages = 1 << oo_order(oo); - - kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. - */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); - } - page->objects = oo_objects(oo); order = compound_order(page); @@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - kmemcheck_free_shadow(page, compound_order(page)); - mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, @@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s) */ static int calculate_sizes(struct kmem_cache *s, int forced_order) { - unsigned long flags = s->flags; + slab_flags_t flags = s->flags; size_t size = s->object_size; int order; @@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) +static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; @@ -3655,7 +3634,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } @@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void) struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return s; } -int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; @@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; - if (!(s->flags & SLAB_NOTRACK)) - *p++ = 't'; if (s->flags & SLAB_ACCOUNT) *p++ = 'A'; if (p != name + 1) @@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s) return 0; } + if (!unmergeable && disable_higher_order_debug && + (slub_debug & DEBUG_METADATA_FLAGS)) + unmergeable = 1; + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init); /* * The /proc/slabinfo ABI */ -#ifdef CONFIG_SLABINFO +#ifdef CONFIG_SLUB_DEBUG void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_slabs = 0; @@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, { return -EIO; } -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 478ce6d4a2c4..17acf01791fa 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid(size, align, goal, + return memblock_virt_alloc_try_nid_raw(size, align, goal, BOOTMEM_ALLOC_ACCESSIBLE, node); } @@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { + gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + int order = get_order(size); + static bool warned; struct page *page; - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, gfp_mask, order); if (page) return page_address(page); + + if (!warned) { + warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL, + "vmemmap alloc failure: order:%u", order); + warned = true; + } return NULL; } else return __earlyonly_bootmem_alloc(node, size, size, @@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) return pte; } +static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) +{ + void *p = vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + memset(p, 0, size); + + return p; +} + pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pmd_populate_kernel(&init_mm, pmd, p); @@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pud_populate(&init_mm, pud, p); @@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; p4d_populate(&init_mm, p4d, p); @@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pgd_populate(&init_mm, pgd, p); diff --git a/mm/sparse.c b/mm/sparse.c index 044138852baf..2609aba121e8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -207,6 +207,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) { unsigned long pfn; +#ifdef CONFIG_SPARSEMEM_EXTREME + if (unlikely(!mem_section)) { + unsigned long size, align; + + size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_virt_alloc(size, align); + } +#endif + start &= PAGE_SECTION_MASK; mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { @@ -443,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); + map = memblock_virt_alloc_try_nid_raw(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) diff --git a/mm/swap.c b/mm/swap.c index a77d68f2c1b6..38e1b6374a97 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, false); + free_unref_page(page); } static void __put_compound_page(struct page *page) @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, pvec->cold); + release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -740,7 +740,7 @@ void lru_add_drain_all(void) * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. */ -void release_pages(struct page **pages, int nr, bool cold) +void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); @@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold) spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); - free_hot_cold_page_list(&pages_to_free, cold); + free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); @@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages); */ void __pagevec_release(struct pagevec *pvec) { - lru_add_drain(); - release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + if (!pvec->percpu_pvec_drained) { + lru_add_drain(); + pvec->percpu_pvec_drained = true; + } + release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); @@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag) { - pvec->nr = find_get_pages_tag(mapping, index, tag, - nr_pages, pvec->pages); + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ diff --git a/mm/swap_slots.c b/mm/swap_slots.c index d81cfc5a43d5..bebc19292018 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu) cache->nr = 0; cache->cur = 0; cache->n_ret = 0; + /* + * We initialized alloc_lock and free_lock earlier. We use + * !cache->slots or !cache->slots_ret to know if it is safe to acquire + * the corresponding lock and use the cache. Memory barrier below + * ensures the assumption. + */ + mb(); cache->slots = slots; slots = NULL; cache->slots_ret = slots_ret; @@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry) struct swap_slots_cache *cache; cache = raw_cpu_ptr(&swp_slots); - if (use_swap_slot_cache && cache->slots_ret) { + if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ if (!use_swap_slot_cache || !cache->slots_ret) { @@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page) */ cache = raw_cpu_ptr(&swp_slots); - if (check_cache_active()) { + if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: diff --git a/mm/swap_state.c b/mm/swap_state.c index 326439428daf..39ae7cfad90f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES]; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; -bool swap_vma_readahead = true; +struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +bool swap_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) lru_add_drain(); for (i = 0; i < nr; i++) free_swap_cache(pagep[i]); - release_pages(pagep, nr, false); + release_pages(pagep, nr); } /* @@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; + struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; @@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; + if (end_offset >= si->max) + end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { diff --git a/mm/swapfile.c b/mm/swapfile.c index e47a21e64764..3074b02eaa09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page) return count; } +int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + pgoff_t offset = swp_offset(entry); + + return swap_count(si->swap_map[offset]); +} + static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { int count = 0; @@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) p->flags |= SWP_STABLE_WRITES; + if (bdi_cap_synchronous_io(inode_to_bdi(inode))) + p->flags |= SWP_SYNCHRONOUS_IO; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; unsigned long ci, nr_cluster; @@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return swap_info[swp_type(entry)]; +} + struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; - return swap_info[swp_type(swap)]; + swp_entry_t entry = { .val = page_private(page) }; + return swp_swap_info(entry); } /* @@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page) */ struct address_space *__page_file_mapping(struct page *page) { - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return page_swap_info(page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(__page_file_mapping); @@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); diff --git a/mm/truncate.c b/mm/truncate.c index 2330223841fb..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,44 +25,85 @@ #include <linux/rmap.h> #include "internal.h" -static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, - void *entry) +/* + * Regular page slots are stabilized by the page lock even without the tree + * itself locked. These unlocked entries need verification under the tree + * lock. + */ +static inline void __clear_shadow_entry(struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_node *node; void **slot; - spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; + return; if (*slot != entry) - goto unlock; + return; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, - workingset_update_node, mapping); + workingset_update_node); mapping->nrexceptional--; -unlock: +} + +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) +{ + spin_lock_irq(&mapping->tree_lock); + __clear_shadow_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); } /* - * Unconditionally remove exceptional entry. Usually called from truncate path. + * Unconditionally remove exceptional entries. Usually called from truncate + * path. Note that the pagevec may be altered by this function by removing + * exceptional entries similar to what pagevec_remove_exceptionals does. */ -static void truncate_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void truncate_exceptional_pvec_entries(struct address_space *mapping, + struct pagevec *pvec, pgoff_t *indices, + pgoff_t end) { + int i, j; + bool dax, lock; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); + for (j = 0; j < pagevec_count(pvec); j++) + if (radix_tree_exceptional_entry(pvec->pages[j])) + break; + + if (j == pagevec_count(pvec)) return; + + dax = dax_mapping(mapping); + lock = !dax && indices[j] < end; + if (lock) + spin_lock_irq(&mapping->tree_lock); + + for (i = j; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + pgoff_t index = indices[i]; + + if (!radix_tree_exceptional_entry(page)) { + pvec->pages[j++] = page; + continue; + } + + if (index >= end) + continue; + + if (unlikely(dax)) { + dax_delete_mapping_entry(mapping, index); + continue; + } + + __clear_shadow_entry(mapping, index, page); } - clear_shadow_entry(mapping, index, entry); + + if (lock) + spin_unlock_irq(&mapping->tree_lock); + pvec->nr = j; } /* @@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static int -truncate_complete_page(struct address_space *mapping, struct page *page) +static void +truncate_cleanup_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) - return -EIO; + if (page_mapped(page)) { + loff_t holelen; + + holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_SHIFT, + holelen, 0); + } if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_SIZE); @@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) */ cancel_dirty_page(page); ClearPageMappedToDisk(page); - delete_from_page_cache(page); - return 0; } /* @@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { - loff_t holelen; VM_BUG_ON_PAGE(PageTail(page), page); - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); - } - return truncate_complete_page(mapping, page); + if (page->mapping != mapping) + return -EIO; + + truncate_cleanup_page(mapping, page); + delete_from_page_cache(page); + return 0; } /* @@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* + * Pagevec array has exceptional entries and we may also fail + * to lock some pages. So we store pages that can be deleted + * in a new pagevec. + */ + struct pagevec locked_pvec; + + pagevec_init(&locked_pvec); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } if (!trylock_page(page)) continue; @@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); - unlock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + pagevec_add(&locked_pvec, page); } - pagevec_remove_exceptionals(&pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + truncate_cleanup_page(mapping, locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); cond_resched(); index++; } - if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } lock_page(page); WARN_ON(page_to_index(page) != index); @@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - pagevec_remove_exceptionals(&pvec); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); index++; } @@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { @@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping->nrpages == 0 && mapping->nrexceptional == 0) goto out; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, diff --git a/mm/vmscan.c b/mm/vmscan.c index eb2f0315b8c0..47d5ced51f2d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -297,10 +297,13 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { + if (!shrinker->nr_deferred) + return; down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; } EXPORT_SYMBOL(unregister_shrinker); @@ -1349,7 +1352,7 @@ keep: mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); - free_hot_cold_page_list(&free_pages, true); + free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1824,7 +1827,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); - free_hot_cold_page_list(&page_list, true); + free_unref_page_list(&page_list); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1868,7 +1871,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * also allow kswapd to start writing pages during reclaim. */ if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(0, WB_REASON_VMSCAN); + wakeup_flusher_threads(WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); } @@ -2063,7 +2066,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); - free_hot_cold_page_list(&l_hold, true); + free_unref_page_list(&l_hold); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2082,7 +2085,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages - * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * * total target max diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e72ac97..40b2db6db6b1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,77 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +#ifdef CONFIG_NUMA +int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; + +/* zero numa counters within a zone */ +static void zero_zone_numa_counters(struct zone *zone) +{ + int item, cpu; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_stat[item], 0); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + = 0; + } +} + +/* zero numa counters of all the populated zones */ +static void zero_zones_numa_counters(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zero_zone_numa_counters(zone); +} + +/* zero global numa counters */ +static void zero_global_numa_counters(void) +{ + int item; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) + atomic_long_set(&vm_numa_stat[item], 0); +} + +static void invalid_numa_statistics(void) +{ + zero_zones_numa_counters(); + zero_global_numa_counters(); +} + +static DEFINE_MUTEX(vm_numa_stat_lock); + +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret, oldval; + + mutex_lock(&vm_numa_stat_lock); + if (write) + oldval = sysctl_vm_numa_stat; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (oldval == sysctl_vm_numa_stat) + goto out; + else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { + static_branch_enable(&vm_numa_stat_key); + pr_info("enable numa statistics\n"); + } else { + static_branch_disable(&vm_numa_stat_key); + invalid_numa_statistics(); + pr_info("disable numa statistics, and clear numa counters\n"); + } + +out: + mutex_unlock(&vm_numa_stat_lock); + return ret; +} +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu" - "\n node_inactive_ratio: %u", + "\n start_pfn: %lu", pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn, - zone->zone_pgdat->inactive_ratio); + zone->zone_start_pfn); seq_putc(m, '\n'); } diff --git a/mm/workingset.c b/mm/workingset.c index b997c9de28f6..b7d616a3bbbe 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -340,14 +340,8 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node, void *private) +void workingset_update_node(struct radix_tree_node *node) { - struct address_space *mapping = private; - - /* Only regular page cache has shadow entries */ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); out_invalid: spin_unlock(&mapping->tree_lock); diff --git a/mm/z3fold.c b/mm/z3fold.c index b2ba2ba585f3..39e19125d6a0 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -404,8 +404,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) WARN_ON(z3fold_page_trylock(zhdr)); else z3fold_page_lock(zhdr); - if (test_bit(PAGE_STALE, &page->private) || - !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { z3fold_page_unlock(zhdr); return; } @@ -413,6 +412,11 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + z3fold_compact_page(zhdr); unbuddied = get_cpu_ptr(pool->unbuddied); fchunks = num_free_chunks(zhdr); @@ -753,9 +757,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); zhdr->cpu = -1; + kref_get(&zhdr->refcount); do_compact_page(zhdr, true); return; } + kref_get(&zhdr->refcount); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); z3fold_page_unlock(zhdr); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7c38e850a8fc..683c0651098c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -53,6 +53,7 @@ #include <linux/mount.h> #include <linux/migrate.h> #include <linux/pagemap.h> +#include <linux/fs.h> #define ZSPAGE_MAGIC 0x58 @@ -1349,7 +1350,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - WARN_ON_ONCE(in_interrupt()); + BUG_ON(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); |