diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-03 10:10:02 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-03 10:10:02 -0800 |
commit | b37a05c083c85c2657dca9bbe1f5d79dccf756d5 (patch) | |
tree | 0a9bd376a437484e21a6728ca16f2266a0e3e788 /mm | |
parent | d5bfb96bdad3588961f49a6eff89a625fbaa12bf (diff) | |
parent | 12c9d70bd5056b3ae84746fca973c286f48384cc (diff) | |
download | linux-b37a05c083c85c2657dca9bbe1f5d79dccf756d5.tar.bz2 |
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
"18 fixes"
[ The 18 fixes turned into 17 commits, because one of the fixes was a
fix for another patch in the series that I just folded in by editing
the patch manually - hopefully correctly - Linus ]
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: fix memory leak in copy_huge_pmd()
drivers/hwspinlock: fix race between radix tree insertion and lookup
radix-tree: fix race in gang lookup
mm/vmpressure.c: fix subtree pressure detection
mm: polish virtual memory accounting
mm: warn about VmData over RLIMIT_DATA
Documentation: cgroup-v2: add memory.stat::sock description
mm: memcontrol: drop superfluous entry in the per-memcg stats array
drivers/scsi/sg.c: mark VMA as VM_IO to prevent migration
proc: revert /proc/<pid>/maps [stack:TID] annotation
numa: fix /proc/<pid>/numa_maps for hugetlbfs on s390
MAINTAINERS: update Seth email
ocfs2/cluster: fix memory leak in o2hb_region_release
lib/test-string_helpers.c: fix and improve string_get_size() tests
thp: limit number of object to scan on deferred_split_scan()
thp: change deferred_split_count() to return number of THP in queue
thp: make split_queue per-node
Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 87 | ||||
-rw-r--r-- | mm/internal.h | 31 | ||||
-rw-r--r-- | mm/mmap.c | 23 | ||||
-rw-r--r-- | mm/page_alloc.c | 5 | ||||
-rw-r--r-- | mm/util.c | 27 | ||||
-rw-r--r-- | mm/vmpressure.c | 3 |
6 files changed, 103 insertions, 73 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd3a07b3e6f4..36c070167b71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; -static DEFINE_SPINLOCK(split_queue_lock); -static LIST_HEAD(split_queue); -static unsigned long split_queue_len; static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) @@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - pgtable_trans_huge_deposit(mm, pmd, pgtable); + if (pgtable) + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); return true; @@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; - pgtable_t pgtable; + pgtable_t pgtable = NULL; int ret; - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; + if (!vma_is_dax(vma)) { + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + } dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (pmd_trans_huge(pmd)) { + if (!vma_is_dax(vma)) { /* thp accounting separate from pmd_devmap accounting */ src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); @@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; @@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { if (!list_empty(page_deferred_list(head))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } @@ -3436,64 +3437,65 @@ out: void free_transhuge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { - list_add_tail(page_deferred_list(page), &split_queue); - split_queue_len++; + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { - /* - * Split a page from split_queue will free up at least one page, - * at most HPAGE_PMD_NR - 1. We don't track exact number. - * Let's use HPAGE_PMD_NR / 2 as ballpark. - */ - return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; + struct pglist_data *pgdata = NODE_DATA(sc->nid); + return ACCESS_ONCE(pgdata->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_init(&split_queue, &list); - + spin_lock_irqsave(&pgdata->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); - /* race with put_compound_page() */ - if (!get_page_unless_zero(page)) { + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - split_queue_len--; + pgdata->split_queue_len--; } + if (!--sc->nr_to_scan) + break; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_tail(&list, &split_queue); - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - return split * HPAGE_PMD_NR / 2; + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&pgdata->split_queue)) + return SHRINK_STOP; + return split; } static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/internal.h b/mm/internal.h index ed8b5ffcf9b1..a38a21ebddb4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area - atomatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return (flags & VM_STACK) == VM_STACK; +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); diff --git a/mm/mmap.c b/mm/mmap.c index 84b12624ceb0..cfc0cdca421e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -42,6 +42,7 @@ #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> +#include <linux/moduleparam.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif +static bool ignore_rlimit_data = true; +core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, @@ -2982,9 +2985,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; - if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & - (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) - return mm->data_vm + npages <= rlimit(RLIMIT_DATA); + if (is_data_mapping(flags) && + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { + if (ignore_rlimit_data) + pr_warn_once("%s (%d): VmData %lu exceed data ulimit " + "%lu. Will be forbidden soon.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA)); + else + return false; + } return true; } @@ -2993,11 +3004,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { mm->total_vm += npages; - if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) + if (is_exec_mapping(flags)) mm->exec_vm += npages; - else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) + else if (is_stack_mapping(flags)) mm->stack_vm += npages; - else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + else if (is_data_mapping(flags)) mm->data_vm += npages; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63358d9f9aa9..ea2c4d3e0c03 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; +#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); diff --git a/mm/util.c b/mm/util.c index c108a6542d05..4fb14ca5a419 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). - */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9a6c0704211c..149fdf6c5c56 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, if (tree) { spin_lock(&vmpr->sr_lock); - vmpr->tree_scanned += scanned; + scanned = vmpr->tree_scanned += scanned; vmpr->tree_reclaimed += reclaimed; - scanned = vmpr->scanned; spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win) |