diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 20 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/mprotect.c | 25 | ||||
-rw-r--r-- | mm/page-writeback.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 30 | ||||
-rw-r--r-- | mm/slub.c | 38 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/swap_state.c | 63 | ||||
-rw-r--r-- | mm/swapfile.c | 11 | ||||
-rw-r--r-- | mm/vmpressure.c | 1 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
15 files changed, 169 insertions, 81 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index d56d3c145b9f..7a13f6ac5421 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } return ret; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82166bf974e1..1546655a2d78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1166,8 +1166,10 @@ alloc: } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) + if (ret & VM_FAULT_OOM) { split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } put_page(page); } count_vm_event(THP_FAULT_FALLBACK); @@ -1179,9 +1181,10 @@ alloc: if (page) { split_huge_page(page); put_page(page); - } + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); - ret |= VM_FAULT_OOM; goto out; } @@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { - entry = *pmd; - entry = pmd_mknuma(entry); + pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(ptl); } @@ -1963,7 +1961,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f0..5b6b0039f725 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1127,8 +1127,8 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css->flags & CSS_ONLINE) && - (next_css == &root->css || css_tryget(next_css))) + if ((next_css == &root->css) || + ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) * protects memcg_name and makes sure that parallel ooms do not * interleave */ - static DEFINE_SPINLOCK(oom_info_lock); + static DEFINE_MUTEX(oom_info_lock); struct cgroup *task_cgrp; struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; @@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; - spin_lock(&oom_info_lock); + mutex_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1767,7 +1767,7 @@ done: pr_cont("\n"); } - spin_unlock(&oom_info_lock); + mutex_unlock(&oom_info_lock); } /* @@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; + struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) kmem_cgroup_css_offline(memcg); mem_cgroup_invalidate_reclaim_iterators(memcg); - mem_cgroup_reparent_charges(memcg); + + /* + * This requires that offlining is serialized. Right now that is + * guaranteed because css_killed_work_fn() holds the cgroup_mutex. + */ + css_for_each_descendant_post(iter, css) + mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f08a2d61487..90002ea43638 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -945,8 +945,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * to it. Similarly, page lock is shifted. */ if (hpage != p) { - put_page(hpage); - get_page(p); + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } lock_page(p); unlock_page(hpage); *hpagep = p; @@ -1649,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); + struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/memory.c b/mm/memory.c index be6a0c0d4ae0..22dfa617bddb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); ret = VM_FAULT_HWPOISON; + page_cache_release(vmf.page); goto uncharge_out; } @@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3741,20 +3741,13 @@ retry: if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - /* - * If COW results in an oom, the huge pmd will - * have been split, so retry the fault on the - * pte for a smaller charge. - */ - if (unlikely(ret & VM_FAULT_OOM)) - goto retry; - return ret; + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); + return 0; } - - return 0; } } diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c1785744..769a67a15803 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we + * know to be dirty. + */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; - ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, pte, ptent); + ptep_set_numa(mm, addr, pte); updated = true; } } } - - /* - * Avoid taking write faults for pages we know to be - * dirty. - */ - if (dirty_accountable && pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2d30e2cfe804..7106cb1aca8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3758a09a009..3bac76ae4b30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } @@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; +} +#else +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return false; +} #endif /* @@ -1572,7 +1583,13 @@ again: get_pageblock_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + /* + * NOTE: GFP_THISNODE allocations do not partake in the kswapd + * aging protocol, so they can't be fair. + */ + if (!gfp_thisnode_allocation(gfp_flags)) + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1944,8 +1961,12 @@ zonelist_scan: * ultimately fall back to remote zones that do not * partake in the fairness round-robin cycle of this * zonelist. + * + * NOTE: GFP_THISNODE allocations do not partake in + * the kswapd aging protocol, so they can't be fair. */ - if (alloc_flags & ALLOC_WMARK_LOW) { + if ((alloc_flags & ALLOC_WMARK_LOW) && + !gfp_thisnode_allocation(gfp_mask)) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; if (!zone_local(preferred_zone, zone)) @@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (gfp_thisnode_allocation(gfp_mask)) goto nopage; restart: diff --git a/mm/slub.c b/mm/slub.c index 7e3e0458bce4..25f14ad8f817 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1004,21 +1004,19 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) static void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { - lockdep_assert_held(&n->list_lock); - if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_add(&page->lru, &n->full); } static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { - lockdep_assert_held(&n->list_lock); - if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_del(&page->lru); } @@ -1520,11 +1518,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page) /* * Management of partially allocated slabs. */ -static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - lockdep_assert_held(&n->list_lock); - n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); @@ -1532,15 +1528,27 @@ static inline void add_partial(struct kmem_cache_node *n, list_add(&page->lru, &n->partial); } -static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) { lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) +{ list_del(&page->lru); n->nr_partial--; } +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + lockdep_assert_held(&n->list_lock); + __remove_partial(n, page); +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. @@ -2906,12 +2914,10 @@ static void early_kmem_cache_node_alloc(int node) inc_slabs_node(kmem_cache_node, node, page->objects); /* - * the lock is for lockdep's sake, not for any actual - * race protection + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. */ - spin_lock(&n->list_lock); - add_partial(n, page, DEACTIVATE_TO_HEAD); - spin_unlock(&n->list_lock); + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -3197,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - remove_partial(n, page); + __remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, diff --git a/mm/swap.c b/mm/swap.c index b31ba67d440a..0092097b3f4c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,7 +98,7 @@ static void put_compound_page(struct page *page) } /* __split_huge_page_refcount can run under us */ - page_head = compound_trans_head(page); + page_head = compound_head(page); /* * THP can not break up slab pages so avoid taking @@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page) */ unsigned long flags; bool got; - struct page *page_head = compound_trans_head(page); + struct page *page_head = compound_head(page); /* Ref to put_compound_page() comment. */ if (!__compound_tail_refcounted(page_head)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index c6c13b050a58..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; - p->flags = 0; frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. + */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 196970a4541f..d4042e75f7c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h> +#include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; |